"""Convert regional netcdf files to a single GLAMBIE submission CSV. Three checks run before writing: 1. All 19 RGI regions are present. 2. Spot check: 20 randomly sampled glaciers (fixed seed) compared against fixed_geometry_mass_balance reference CSVs. Reports the max single-year absolute difference and mean difference over the overlap period (both in mm w.e./yr). Raises if any glacier exceeds MAX_DIFF_THRESHOLD. 3. Area check: regional total area matches glacier_statistics (hence coverage % is also accurate). """ import glob import os import numpy as np import pandas as pd import xarray as xr N_SPOT_CHECK = 20 MAX_DIFF_THRESHOLD = 5.0 # mm w.e./yr, max single-year absolute difference NETCDF_DIR = os.path.join(os.path.dirname(__file__), "glambie_fixed_geometry") GLACIER_DIR = os.path.join(NETCDF_DIR, "per_glacier") STATS_DIR = os.path.join(os.path.dirname(__file__), "oggm_ref_files") OUTPUT_CSV = os.path.join(os.path.dirname(__file__), "glambie_submission_fixed_geometry.csv") COLUMNS = [ "region_id", "start_date", "end_date", "glacier_change_observed", "glacier_change_uncertainty", "glacier_change_observed_above_water_level", "glacier_change_observed_above_water_level_uncertainty", "glacier_change_observed_below_water_level", "glacier_change_observed_below_water_level_uncertainty", "glacier_frontal_ablation", "glacier_frontal_ablation_uncertainty", "unit", "glacier_area_reference_start", "glacier_area_reference_end", "observational_coverage_percentage", "remarks", ] def nc_to_rows(nc_path): with xr.open_dataset(nc_path) as ds: times = pd.DatetimeIndex(ds["time"].values) mb = ds["region_specific_mb_mwe"].values area_km2 = float(ds["total_region_area_m2"].values) / 1e6 coverage = float(ds["completion_rate_pct"].values) region_id = int(ds.attrs["region_id"]) rows = [] for i, t in enumerate(times): end = t + pd.offsets.MonthBegin(1) rows.append({ "region_id": region_id, "_sort_time": t, "start_date": t.strftime("%d/%m/%Y"), "end_date": end.strftime("%d/%m/%Y"), "glacier_change_observed": mb[i], "glacier_change_uncertainty": np.nan, "glacier_change_observed_above_water_level": mb[i], "glacier_change_observed_above_water_level_uncertainty": np.nan, "glacier_change_observed_below_water_level": 0.0, "glacier_change_observed_below_water_level_uncertainty": np.nan, "glacier_frontal_ablation": np.nan, "glacier_frontal_ablation_uncertainty": np.nan, "unit": "mwe", "glacier_area_reference_start": area_km2, "glacier_area_reference_end": area_km2, "observational_coverage_percentage": coverage, "remarks": np.nan, }) return rows # --------------------------------------------------------------------------- nc_files = sorted(glob.glob(os.path.join(NETCDF_DIR, "region_specific_mb_RGI*.nc"))) if not nc_files: raise RuntimeError(f"No regional netcdf files found in {NETCDF_DIR}") # ---- Check 1: all 19 regions ----------------------------------------------- region_ids = [] for fp in nc_files: with xr.open_dataset(fp) as ds: region_ids.append(int(ds.attrs["region_id"])) missing = sorted(set(range(1, 20)) - set(region_ids)) if missing: raise RuntimeError(f"Missing RGI regions: {[f'{r:02d}' for r in missing]}") print("[OK] All 19 regions present\n") # ---- Check 2: spot check annual MB against reference ----------------------- # Sample N_SPOT_CHECK glaciers at random (fixed seed for reproducibility). # All regions must have a reference CSV — missing file raises immediately. all_glacier_files = sorted(glob.glob( os.path.join(GLACIER_DIR, "**", "RGI*.nc"), recursive=True )) rng = np.random.default_rng(42) sample = rng.choice(all_glacier_files, size=min(N_SPOT_CHECK, len(all_glacier_files)), replace=False) print(f"Spot check ({len(sample)} random glaciers, threshold {MAX_DIFF_THRESHOLD} mm w.e./yr max single-year diff):") n_checked = 0 for gfp in sorted(sample): fname = os.path.basename(gfp) rgi_id = fname.replace(".nc", "") region_id = rgi_id.split("-")[1].split(".")[0] ref_path = os.path.join(STATS_DIR, f"fixed_geometry_mass_balance_{region_id}.csv") if not os.path.exists(ref_path): raise RuntimeError(f"Reference CSV missing for region {region_id}: {ref_path}") ref_csv = pd.read_csv(ref_path, index_col=0) if rgi_id not in ref_csv.columns: print(f" {rgi_id}: not in reference CSV (likely failed glacier), skipping") continue with xr.open_dataset(gfp) as ds: if int(ds["task_success"].values) != 1: print(f" {rgi_id}: task_success=0, skipping") continue monthly = ds["specific_mb_mwe"].to_series() annual = monthly.resample("YE").sum() * 1000 # m w.e. → mm w.e. annual.index = annual.index.year ref_annual = ref_csv[rgi_id] overlap = annual.index.intersection(ref_annual.index) diff = annual[overlap] - ref_annual[overlap] max_diff = diff.abs().max() mean_diff = diff.mean() if max_diff > MAX_DIFF_THRESHOLD: raise RuntimeError( f"MB mismatch for {rgi_id}: max single-year diff = {max_diff:.4f} mm w.e./yr " f"(mean = {mean_diff:.4f}) — threshold is {MAX_DIFF_THRESHOLD}" ) print(f" {rgi_id}: max={max_diff:.2e} mean={mean_diff:+.4f} mm w.e./yr") n_checked += 1 print(f"[OK] Spot check passed ({n_checked} glaciers checked)\n") # ---- Check 3: area vs glacier_statistics ----------------------------------- print("Area check:") area_warnings = [] for fp in nc_files: with xr.open_dataset(fp) as ds: region_id = f"{int(ds.attrs['region_id']):02d}" nc_area = float(ds["total_region_area_m2"].values) / 1e6 coverage = float(ds["completion_rate_pct"].values) stats_path = os.path.join(STATS_DIR, f"glacier_statistics_{region_id}.csv") expected_area = pd.read_csv(stats_path, index_col=0, low_memory=False)["rgi_area_km2"].sum() diff = nc_area - expected_area flag = " *** MISMATCH" if abs(diff) > 1.0 else "" if flag: area_warnings.append(f"RGI{region_id}") print(f" RGI{region_id}: expected={expected_area:.3f} got={nc_area:.3f} diff={diff:+.3f} km²" f" coverage={coverage:.1f}%{flag}") if not area_warnings: print("[OK] All regional areas match glacier_statistics\n") else: print(f"\n[WARN] Area mismatch in: {', '.join(area_warnings)} — some glacier files may be missing\n") # ---- Build and write CSV --------------------------------------------------- all_rows = [] for fp in nc_files: all_rows.extend(nc_to_rows(fp)) df = pd.DataFrame(all_rows) df = df.sort_values(["region_id", "_sort_time"]).reset_index(drop=True) df = df[COLUMNS] df["glacier_change_observed"] = df["glacier_change_observed"].round(4) df["glacier_change_observed_above_water_level"] = df["glacier_change_observed"] df.to_csv(OUTPUT_CSV, index=False, na_rep="") print(f"Wrote {len(df)} rows ({df['region_id'].nunique()} regions) → {OUTPUT_CSV}")