"""Convert regional netcdf files to a single GLAMBIE submission CSV.

Three checks run before writing:
  1. All 19 RGI regions are present.
  2. Spot check: 20 randomly sampled glaciers (fixed seed) compared against
     fixed_geometry_mass_balance reference CSVs.  Reports the max single-year
     absolute difference and mean difference over the overlap period (both in
     mm w.e./yr).  Raises if any glacier exceeds MAX_DIFF_THRESHOLD.
  3. Area check: regional total area matches glacier_statistics (hence
     coverage % is also accurate).
"""
import glob
import os

import numpy as np
import pandas as pd
import xarray as xr

N_SPOT_CHECK = 20
MAX_DIFF_THRESHOLD = 5.0  # mm w.e./yr, max single-year absolute difference

NETCDF_DIR = os.path.join(os.path.dirname(__file__), "glambie_fixed_geometry")
GLACIER_DIR = os.path.join(NETCDF_DIR, "per_glacier")
STATS_DIR = os.path.join(os.path.dirname(__file__), "oggm_ref_files")
OUTPUT_CSV = os.path.join(os.path.dirname(__file__), "glambie_submission_fixed_geometry.csv")

COLUMNS = [
    "region_id",
    "start_date",
    "end_date",
    "glacier_change_observed",
    "glacier_change_uncertainty",
    "glacier_change_observed_above_water_level",
    "glacier_change_observed_above_water_level_uncertainty",
    "glacier_change_observed_below_water_level",
    "glacier_change_observed_below_water_level_uncertainty",
    "glacier_frontal_ablation",
    "glacier_frontal_ablation_uncertainty",
    "unit",
    "glacier_area_reference_start",
    "glacier_area_reference_end",
    "observational_coverage_percentage",
    "remarks",
]


def nc_to_rows(nc_path):
    with xr.open_dataset(nc_path) as ds:
        times = pd.DatetimeIndex(ds["time"].values)
        mb = ds["region_specific_mb_mwe"].values
        area_km2 = float(ds["total_region_area_m2"].values) / 1e6
        coverage = float(ds["completion_rate_pct"].values)
        region_id = int(ds.attrs["region_id"])

    rows = []
    for i, t in enumerate(times):
        end = t + pd.offsets.MonthBegin(1)
        rows.append({
            "region_id": region_id,
            "_sort_time": t,
            "start_date": t.strftime("%d/%m/%Y"),
            "end_date": end.strftime("%d/%m/%Y"),
            "glacier_change_observed": mb[i],
            "glacier_change_uncertainty": np.nan,
            "glacier_change_observed_above_water_level": mb[i],
            "glacier_change_observed_above_water_level_uncertainty": np.nan,
            "glacier_change_observed_below_water_level": 0.0,
            "glacier_change_observed_below_water_level_uncertainty": np.nan,
            "glacier_frontal_ablation": np.nan,
            "glacier_frontal_ablation_uncertainty": np.nan,
            "unit": "mwe",
            "glacier_area_reference_start": area_km2,
            "glacier_area_reference_end": area_km2,
            "observational_coverage_percentage": coverage,
            "remarks": np.nan,
        })
    return rows


# ---------------------------------------------------------------------------
nc_files = sorted(glob.glob(os.path.join(NETCDF_DIR, "region_specific_mb_RGI*.nc")))
if not nc_files:
    raise RuntimeError(f"No regional netcdf files found in {NETCDF_DIR}")

# ---- Check 1: all 19 regions -----------------------------------------------
region_ids = []
for fp in nc_files:
    with xr.open_dataset(fp) as ds:
        region_ids.append(int(ds.attrs["region_id"]))

missing = sorted(set(range(1, 20)) - set(region_ids))
if missing:
    raise RuntimeError(f"Missing RGI regions: {[f'{r:02d}' for r in missing]}")
print("[OK] All 19 regions present\n")

# ---- Check 2: spot check annual MB against reference -----------------------
# Sample N_SPOT_CHECK glaciers at random (fixed seed for reproducibility).
# All regions must have a reference CSV — missing file raises immediately.
all_glacier_files = sorted(glob.glob(
    os.path.join(GLACIER_DIR, "**", "RGI*.nc"), recursive=True
))
rng = np.random.default_rng(42)
sample = rng.choice(all_glacier_files, size=min(N_SPOT_CHECK, len(all_glacier_files)), replace=False)

print(f"Spot check ({len(sample)} random glaciers, threshold {MAX_DIFF_THRESHOLD} mm w.e./yr max single-year diff):")
n_checked = 0
for gfp in sorted(sample):
    fname = os.path.basename(gfp)
    rgi_id = fname.replace(".nc", "")
    region_id = rgi_id.split("-")[1].split(".")[0]

    ref_path = os.path.join(STATS_DIR, f"fixed_geometry_mass_balance_{region_id}.csv")
    if not os.path.exists(ref_path):
        raise RuntimeError(f"Reference CSV missing for region {region_id}: {ref_path}")

    ref_csv = pd.read_csv(ref_path, index_col=0)
    if rgi_id not in ref_csv.columns:
        print(f"  {rgi_id}: not in reference CSV (likely failed glacier), skipping")
        continue

    with xr.open_dataset(gfp) as ds:
        if int(ds["task_success"].values) != 1:
            print(f"  {rgi_id}: task_success=0, skipping")
            continue
        monthly = ds["specific_mb_mwe"].to_series()

    annual = monthly.resample("YE").sum() * 1000  # m w.e. → mm w.e.
    annual.index = annual.index.year
    ref_annual = ref_csv[rgi_id]
    overlap = annual.index.intersection(ref_annual.index)
    diff = annual[overlap] - ref_annual[overlap]
    max_diff = diff.abs().max()
    mean_diff = diff.mean()
    if max_diff > MAX_DIFF_THRESHOLD:
        raise RuntimeError(
            f"MB mismatch for {rgi_id}: max single-year diff = {max_diff:.4f} mm w.e./yr "
            f"(mean = {mean_diff:.4f}) — threshold is {MAX_DIFF_THRESHOLD}"
        )
    print(f"  {rgi_id}: max={max_diff:.2e}  mean={mean_diff:+.4f}  mm w.e./yr")
    n_checked += 1

print(f"[OK] Spot check passed ({n_checked} glaciers checked)\n")

# ---- Check 3: area vs glacier_statistics -----------------------------------
print("Area check:")
area_warnings = []
for fp in nc_files:
    with xr.open_dataset(fp) as ds:
        region_id = f"{int(ds.attrs['region_id']):02d}"
        nc_area = float(ds["total_region_area_m2"].values) / 1e6
        coverage = float(ds["completion_rate_pct"].values)

    stats_path = os.path.join(STATS_DIR, f"glacier_statistics_{region_id}.csv")
    expected_area = pd.read_csv(stats_path, index_col=0, low_memory=False)["rgi_area_km2"].sum()
    diff = nc_area - expected_area
    flag = "  *** MISMATCH" if abs(diff) > 1.0 else ""
    if flag:
        area_warnings.append(f"RGI{region_id}")
    print(f"  RGI{region_id}: expected={expected_area:.3f}  got={nc_area:.3f}  diff={diff:+.3f} km²"
          f"  coverage={coverage:.1f}%{flag}")

if not area_warnings:
    print("[OK] All regional areas match glacier_statistics\n")
else:
    print(f"\n[WARN] Area mismatch in: {', '.join(area_warnings)} — some glacier files may be missing\n")

# ---- Build and write CSV ---------------------------------------------------
all_rows = []
for fp in nc_files:
    all_rows.extend(nc_to_rows(fp))

df = pd.DataFrame(all_rows)
df = df.sort_values(["region_id", "_sort_time"]).reset_index(drop=True)
df = df[COLUMNS]
df["glacier_change_observed"] = df["glacier_change_observed"].round(4)
df["glacier_change_observed_above_water_level"] = df["glacier_change_observed"]

df.to_csv(OUTPUT_CSV, index=False, na_rep="")
print(f"Wrote {len(df)} rows ({df['region_id'].nunique()} regions) → {OUTPUT_CSV}")