"""
ukesm_scenarios.py
------------------
Helper module for loading and stitching UKESM1-2-LL scenario files.

Scenarios branch from each other at different points in time (see figure).
This module resolves the full ancestry chain for any requested scenario and
returns a single continuous xarray.Dataset from the pre-industrial past
through the end of the scenario.

Files are searched in DIRPATH and NEW_DATA_DIRPATH. If both contain the same
scenario/variable, NEW_DATA_DIRPATH is preferred and a note is printed.

Scenario tree (approximate branch years):
    piControl  ──────────────────────────────────────────────────────────────
    hist       (1850–2014, standalone)
    up2p0      (1850+, standalone — parallel run to hist, not a branch)
      ├── up2p0-gwl1p5             (branches ~1919)
      │     ├── up2p0-gwl1p5-50y-dn1p0    (branches ~1969)
      │     └── up2p0-gwl1p5-200y-dn1p0   (branches ~2119)
      ├── up2p0-gwl2p0             (branches ~1944)
      │     ├── up2p0-gwl2p0-50y-dn0p5    (branches ~1994)
      │     ├── up2p0-gwl2p0-50y-dn1p0    (branches ~1994)
      │     ├── up2p0-gwl2p0-50y-dn2p0    (branches ~1994)
      │     ├── up2p0-gwl2p0-200y-dn0p5   (branches ~2144)
      │     └── up2p0-gwl2p0-200y-dn1p0   (branches ~2144)
      ├── up2p0-gwl3p0             (branches ~1992)
      │     ├── up2p0-gwl3p0-50y-dn0p5    (branches ~2043)
      │     ├── up2p0-gwl3p0-50y-dn1p0    (branches ~2043)
      │     ├── up2p0-gwl3p0-50y-dn2p0    (branches ~2044)
      │     ├── up2p0-gwl3p0-200y-dn0p5   (branches ~2193)
      │     └── up2p0-gwl3p0-200y-dn1p0   (branches ~2182)
      ├── up2p0-gwl4p0             (branches ~2044)
      │     ├── up2p0-gwl4p0-50y-dn0p5    (branches ~2095)
      │     ├── up2p0-gwl4p0-50y-dn2p0    (branches ~2094)
      │     │     └── up2p0-gwl4p0-50y-dn2p0-gwl2p0  (branches ~2233)
      │     └── up2p0-gwl4p0-200y-dn1p0   (branches ~2245)
      ├── up2p0-gwl5p0             (branches ~2082)
      │     ├── up2p0-gwl5p0-50y-dn2p0    (branches ~2132)
      │     ├── up2p0-gwl5p0-200y-dn0p5   (branches ~2282)
      │     ├── up2p0-gwl5p0-200y-dn1p0   (branches ~2282)
      │     └── up2p0-gwl5p0-200y-dn2p0   (branches ~2282)
      └── up2p0-gwl6p0             (branches ~2137)
            ├── up2p0-gwl6p0-50y-dn1p0    (branches ~2187)
            ├── up2p0-gwl6p0-50y-dn2p0    (branches ~2187)
            ├── up2p0-gwl6p0-200y-dn0p5   (branches ~2337)
            ├── up2p0-gwl6p0-200y-dn1p0   (branches ~2337)
            └── up2p0-gwl6p0-200y-dn2p0   (branches ~2337)
"""

import glob
import xarray as xr
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# ---------------------------------------------------------------------------
# Configuration — adjust these two globals for your environment
# ---------------------------------------------------------------------------

DIRPATH = "/home/www/lschuster/terrafirma_oggm_proj/terrafirma_data/raw_data"

# Additional local data directory.  Files here supplement DIRPATH; having the
# same scenario/variable in both directories raises a FileExistsError.
NEW_DATA_DIRPATH = str(Path(__file__).parent / "new_data")

VARIABLES = ["tas", "pr"]  # variables to load; set to None to load all

# Prefix prepended to scenario names in the actual filenames on disk
# e.g. registry name "up2p0" -> filename fragment "esm-up2p0"
FILE_SCENARIO_PREFIX = "esm-"

# ---------------------------------------------------------------------------
# Scenario registry
#
# Each entry maps a scenario name to its *parent* scenario and the year at
# which it branches off.  The loader will concatenate the parent chain up to
# the branch year, then append the child scenario's own files.
#
# branch_year: the first year that belongs to the child scenario.
#              Everything strictly before this year is taken from the parent.
# ---------------------------------------------------------------------------

SCENARIO_PARENTS: dict[str, dict] = {
    # Leaf entries have no parent (or parent = None / piControl not loaded here)
    "piControl": {"parent": None,  "branch_year": None},
    "hist":      {"parent": None,  "branch_year": None},  # standalone 1850–2014
    "up2p0":     {"parent": None,  "branch_year": None},  # standalone from 1850, parallel to hist

    # GWL1.5 branch and its children
    "up2p0-gwl1p5":             {"parent": "up2p0",         "branch_year": 1919},
    "up2p0-gwl1p5-50y-dn1p0":   {"parent": "up2p0-gwl1p5", "branch_year": 1969},
    "up2p0-gwl1p5-200y-dn1p0":  {"parent": "up2p0-gwl1p5", "branch_year": 2119},

    # GWL2.0 branch and its children
    "up2p0-gwl2p0":             {"parent": "up2p0",          "branch_year": 1944},
    "up2p0-gwl2p0-50y-dn0p5":   {"parent": "up2p0-gwl2p0",  "branch_year": 1994},
    "up2p0-gwl2p0-50y-dn1p0":   {"parent": "up2p0-gwl2p0",  "branch_year": 1994},
    "up2p0-gwl2p0-50y-dn2p0":   {"parent": "up2p0-gwl2p0",  "branch_year": 1994},
    "up2p0-gwl2p0-200y-dn0p5":  {"parent": "up2p0-gwl2p0",  "branch_year": 2144},
    "up2p0-gwl2p0-200y-dn1p0":  {"parent": "up2p0-gwl2p0",  "branch_year": 2144},

    # GWL3.0 branch and its children
    "up2p0-gwl3p0":             {"parent": "up2p0",          "branch_year": 1992},
    "up2p0-gwl3p0-50y-dn0p5":   {"parent": "up2p0-gwl3p0",  "branch_year": 2043},
    "up2p0-gwl3p0-50y-dn1p0":   {"parent": "up2p0-gwl3p0",  "branch_year": 2043},
    "up2p0-gwl3p0-50y-dn2p0":   {"parent": "up2p0-gwl3p0",  "branch_year": 2044},
    "up2p0-gwl3p0-200y-dn0p5":  {"parent": "up2p0-gwl3p0",  "branch_year": 2193},
    "up2p0-gwl3p0-200y-dn1p0":  {"parent": "up2p0-gwl3p0",  "branch_year": 2182},

    # GWL4.0 branch and its children
    "up2p0-gwl4p0":                     {"parent": "up2p0",                   "branch_year": 2044},
    "up2p0-gwl4p0-50y-dn0p5":           {"parent": "up2p0-gwl4p0",            "branch_year": 2095},
    "up2p0-gwl4p0-50y-dn2p0":           {"parent": "up2p0-gwl4p0",            "branch_year": 2094},
    "up2p0-gwl4p0-50y-dn2p0-gwl2p0":    {"parent": "up2p0-gwl4p0-50y-dn2p0",  "branch_year": 2233},
    "up2p0-gwl4p0-200y-dn1p0":          {"parent": "up2p0-gwl4p0",            "branch_year": 2245},

    # GWL5.0 branch and its children
    "up2p0-gwl5p0":             {"parent": "up2p0",          "branch_year": 2082},
    "up2p0-gwl5p0-50y-dn2p0":   {"parent": "up2p0-gwl5p0",  "branch_year": 2132},
    "up2p0-gwl5p0-200y-dn0p5":  {"parent": "up2p0-gwl5p0",  "branch_year": 2282},
    "up2p0-gwl5p0-200y-dn1p0":  {"parent": "up2p0-gwl5p0",  "branch_year": 2282},
    "up2p0-gwl5p0-200y-dn2p0":  {"parent": "up2p0-gwl5p0",  "branch_year": 2282},

    # GWL6.0 branch and its children
    "up2p0-gwl6p0":             {"parent": "up2p0",          "branch_year": 2137},
    "up2p0-gwl6p0-50y-dn1p0":   {"parent": "up2p0-gwl6p0",  "branch_year": 2187},
    "up2p0-gwl6p0-50y-dn2p0":   {"parent": "up2p0-gwl6p0",  "branch_year": 2187},
    "up2p0-gwl6p0-200y-dn0p5":  {"parent": "up2p0-gwl6p0",  "branch_year": 2337},
    "up2p0-gwl6p0-200y-dn1p0":  {"parent": "up2p0-gwl6p0",  "branch_year": 2337},
    "up2p0-gwl6p0-200y-dn2p0":  {"parent": "up2p0-gwl6p0",  "branch_year": 2337},
}

# Scenarios that must never appear as a parent of another scenario
_UNBRANCHING_SCENARIOS = {"hist", "piControl"}

# Convenience list of all known scenario names (for validation / iteration)
ALL_SCENARIOS: list[str] = list(SCENARIO_PARENTS.keys())

# ---------------------------------------------------------------------------
# Module-load-time sanity checks on the registry
# ---------------------------------------------------------------------------
for _name, _info in SCENARIO_PARENTS.items():
    _parent = _info["parent"]
    if _parent is None:
        continue
    if _parent not in SCENARIO_PARENTS:
        raise ValueError(
            f"SCENARIO_PARENTS['{_name}'] references unknown parent '{_parent}'."
        )
    if _parent in _UNBRANCHING_SCENARIOS:
        raise ValueError(
            f"SCENARIO_PARENTS['{_name}'] uses '{_parent}' as a parent, "
            f"but '{_parent}' is never branched from."
        )
del _name, _info, _parent  # clean up loop variables from module namespace


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _glob_scenario_files(scenario: str, variable: str, dirpath: str = DIRPATH) -> list[str]:
    """Return sorted list of NetCDF files for a given scenario and variable.

    Searches *dirpath* and NEW_DATA_DIRPATH. If both contain matching files,
    NEW_DATA_DIRPATH is preferred and a note is printed.
    """
    file_scenario = f"{FILE_SCENARIO_PREFIX}{scenario}"
    fname_glob = f"{variable}_Amon_UKESM1-2-LL_{file_scenario}_r1i1p1f1_gn_*.nc"

    files_main = sorted(glob.glob(str(Path(dirpath) / fname_glob)))

    files_extra: list[str] = []
    if NEW_DATA_DIRPATH and Path(NEW_DATA_DIRPATH).resolve() != Path(dirpath).resolve():
        files_extra = sorted(glob.glob(str(Path(NEW_DATA_DIRPATH) / fname_glob)))

    if files_main and files_extra:
        print(
            f"  Note: scenario='{scenario}', variable='{variable}' found in both "
            f"directories; using new_data version (longer record).\n"
            f"    DIRPATH  : {[Path(f).name for f in files_main]}\n"
            f"    new_data : {[Path(f).name for f in files_extra]}"
        )
        return files_extra

    files = files_main or files_extra
    if not files:
        raise FileNotFoundError(
            f"No files found for scenario='{scenario}', variable='{variable}' in:\n"
            f"  {Path(dirpath) / fname_glob}\n"
            f"  {Path(NEW_DATA_DIRPATH) / fname_glob}"
        )
    return files


def _fill_missing_months(ds: xr.Dataset, scenario: str = "") -> xr.Dataset:
    """Detect single missing months and fill them with the same month from the prior year.

    A gap of exactly 2 in the month-index sequence (year*12 + month-1) means
    one month is absent.  Each such gap is filled by copying the data for
    that calendar month from the immediately preceding year and re-labelling
    the time coordinate.  Raises an error if more than 5 gaps are found
    (which would indicate a structural problem rather than a sporadic glitch).
    """
    time = ds.time
    yr = time.dt.year.values
    mo = time.dt.month.values
    month_idx = yr * 12 + (mo - 1)
    diffs = month_idx[1:] - month_idx[:-1]
    gap_pos = [i for i, d in enumerate(diffs) if d == 2]

    if not gap_pos:
        return ds

    if len(gap_pos) > 5:
        raise ValueError(
            f"Scenario '{scenario}': {len(gap_pos)} missing-month gaps found; "
            f"expected at most a few.  Check the raw files."
        )

    print(f"  Filling {len(gap_pos)} missing month(s) in '{scenario}'.")

    pieces = []
    prev = 0
    for i in gap_pos:
        pieces.append(ds.isel(time=slice(prev, i + 1)))

        # Determine the missing calendar month/year
        t_before = time.values[i]
        miss_month = t_before.month % 12 + 1
        miss_year = t_before.year + (1 if t_before.month == 12 else 0)

        # Source: same calendar month from the previous year
        src_mask = (ds.time.dt.year == miss_year - 1) & (ds.time.dt.month == miss_month)
        src = ds.sel(time=src_mask)
        if len(src.time) == 0:
            raise ValueError(
                f"Cannot fill missing {miss_year}-{miss_month:02d} in '{scenario}': "
                f"no data for the same month in year {miss_year - 1}."
            )

        t_src = src.time.values[0]
        t_new = t_src.replace(year=miss_year)
        fill = src.assign_coords(time=[t_new])
        pieces.append(fill)
        prev = i + 1

    pieces.append(ds.isel(time=slice(prev, None)))
    return xr.concat(pieces, dim="time", data_vars="minimal", coords="minimal", compat="override")


def _set_month_start_time(ds: xr.Dataset) -> xr.Dataset:
    """Normalize all timestamps to the first day of their month."""
    new_time = [t.replace(day=1) for t in ds.time.values]
    return ds.assign_coords(time=("time", new_time))


def _open_scenario_raw(scenario: str, variable: str, dirpath: str = DIRPATH) -> xr.Dataset:
    """Open all files for one scenario/variable as a single Dataset (no stitching).

    Automatically fills up to a handful of single missing months by copying
    the same calendar month from the previous year.
    """
    files = _glob_scenario_files(scenario, variable, dirpath)
    # Use the newer decode_times API to keep cftime support without warnings.
    time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
    ds = xr.open_mfdataset(
        files,
        combine="by_coords",
        decode_times=time_coder,
        data_vars="minimal",
        compat="override",
        coords="minimal",
    )
    ds = _fill_missing_months(ds, scenario=f"{scenario}/{variable}")
    ds = _set_month_start_time(ds)
    return ds


def _resolve_ancestry(scenario: str) -> list[tuple[str, int | None, int | None]]:
    """
    Walk the parent chain and return an ordered list of
        (scenario_name, keep_from_year, keep_until_year)
    where keep_from_year / keep_until_year are the time-slice bounds
    (None means 'open-ended').

    Example for 'up2p0-gwl4p0-50y-dn2p0-gwl2p0':
        [('hist',                         None, 1850),
         ('up2p0',                         1850, 2044),
         ('up2p0-gwl4p0',                  2044, 2095),
         ('up2p0-gwl4p0-50y-dn2p0',        2095, 2233),
         ('up2p0-gwl4p0-50y-dn2p0-gwl2p0', 2233, None)]
    """
    if scenario not in SCENARIO_PARENTS:
        raise ValueError(
            f"Unknown scenario '{scenario}'. "
            f"Known scenarios: {ALL_SCENARIOS}"
        )

    # Build chain from root to leaf
    chain = []
    current = scenario
    while current is not None:
        info = SCENARIO_PARENTS[current]
        chain.append((current, info["branch_year"]))
        current = info["parent"]

    chain.reverse()  # now root-first

    # Convert to (name, keep_from, keep_until) slices
    segments = []
    for i, (name, branch_year) in enumerate(chain):
        keep_from = branch_year  # year this segment starts in the final series
        # The segment ends when the next child branches off
        if i + 1 < len(chain):
            keep_until = chain[i + 1][1]  # branch_year of the next entry
        else:
            keep_until = None  # last segment: keep everything
        segments.append((name, keep_from, keep_until))

    return segments


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def load_scenario(
    scenario: str,
    variable: str | list[str] | None = None,
    dirpath: str = DIRPATH,
    lat: float | None = None,
    lon: float | None = None,
) -> xr.Dataset:
    """
    Load a full, stitched time series for *scenario*, optionally restricted
    to one or more *variable* names.

    Parameters
    ----------
    scenario : str
        One of the keys in SCENARIO_PARENTS, e.g. 'up2p0-gwl4p0'.
    variable : str | list[str] | None
        Variable(s) to load.  Defaults to VARIABLES defined at module level.
        Pass None to load all variables found in the files.
    dirpath : str
        Root directory containing the NetCDF files.
    lat : float | None
        If provided together with *lon*, select the nearest-neighbour grid
        point before loading the full spatial field into memory.
    lon : float | None
        Longitude of the point to select (degrees East, -180 to 360).

    Returns
    -------
    xr.Dataset
        A single Dataset with a continuous time coordinate spanning the
        scenario's full history (stitched from parent segments as needed).
        If lat/lon were supplied the spatial dimensions are dropped and the
        Dataset contains only the time series for that grid point.
    """
    if variable is None:
        variable = VARIABLES
    if isinstance(variable, str):
        variable = [variable]

    segments = _resolve_ancestry(scenario)

    # Load and slice each segment, then concatenate
    pieces: list[xr.Dataset] = []

    for seg_name, keep_from, keep_until in segments:
        # Load raw data for this ancestor/scenario
        # For multi-variable loads we open each variable separately and merge
        if variable is not None:
            ds_list = []
            for var in variable:
                try:
                    ds_var = _open_scenario_raw(seg_name, var, dirpath)
                    if lat is not None and lon is not None:
                        ds_var = ds_var.sel(lat=lat, lon=lon, method="nearest")
                    ds_list.append(ds_var)
                except FileNotFoundError as e:
                    print(f"  Warning: {e}")
            if not ds_list:
                raise FileNotFoundError(
                    f"No data found for any variable in segment '{seg_name}'."
                )
            ds = xr.merge(ds_list, compat="override")
        else:
            # Load the first variable found (fallback, single-var glob)
            raise ValueError("variable=None not supported; specify variable(s) explicitly.")

        # Slice the *end* of this segment only — we never truncate the start.
        # keep_until is the branch year of the next child (exclusive upper bound);
        # everything from that year onward belongs to the child segment instead.
        if keep_until is not None:
            ds = ds.sel(time=ds.time.dt.year < keep_until)

        if len(ds.time) == 0:
            raise ValueError(
                f"Segment '{seg_name}' yielded 0 time steps after slicing to "
                f"year < {keep_until}. Check branch years in SCENARIO_PARENTS."
            )

        pieces.append(ds)

    if not pieces:
        raise RuntimeError(f"No data loaded for scenario '{scenario}'.")

    # Concatenate along time
    stitched = xr.concat(pieces, dim="time", data_vars="minimal", coords="minimal", compat="override")

    # ------------------------------------------------------------------
    # Continuity check: monthly time axis must be gapless and duplicate-free
    # ------------------------------------------------------------------
    time = stitched.time
    if len(time) > 1:
        # Express time as (year * 12 + month) integers for arithmetic
        months = time.dt.year.values * 12 + (time.dt.month.values - 1)
        diffs = months[1:] - months[:-1]
        gaps = (diffs != 1).sum()
        if gaps > 0:
            bad = [(str(time.values[i]), str(time.values[i + 1]))
                   for i, d in enumerate(diffs) if d != 1]
            raise ValueError(
                f"Stitched time series for '{scenario}' has {gaps} discontinuity/ies "
                f"(missing or duplicated months). First offending transition(s): {bad[:5]}"
            )

    # Attach scenario metadata
    stitched.attrs["scenario"] = scenario
    stitched.attrs["ancestry"] = " → ".join(s[0] for s in segments)

    # ------------------------------------------------------------------
    # Handle an incomplete final year (leaf scenario tail only).
    # If exactly one month is missing (11/12), fill it from the same
    # calendar month in the previous year. For larger incompleteness,
    # keep conservative behavior and crop the final year.
    # ------------------------------------------------------------------
    last_year = int(stitched.time.dt.year.values[-1])
    last_year_count = int((stitched.time.dt.year == last_year).sum())
    if last_year_count == 11:
        months_present = set(int(m) for m in stitched.time.dt.month.where(stitched.time.dt.year == last_year, drop=True).values)
        missing_months = [m for m in range(1, 13) if m not in months_present]
        if len(missing_months) != 1:
            raise ValueError(
                f"Expected exactly one missing month in final year {last_year} for '{scenario}', "
                f"but found {len(missing_months)}: {missing_months}."
            )
        miss_month = missing_months[0]
        src_mask = (stitched.time.dt.year == last_year - 1) & (stitched.time.dt.month == miss_month)
        src = stitched.sel(time=src_mask)
        if len(src.time) == 0:
            raise ValueError(
                f"Cannot fill missing final month {last_year}-{miss_month:02d} in '{scenario}': "
                f"no source month found in {last_year - 1}-{miss_month:02d}."
            )
        t_src = src.time.values[0]
        t_new = t_src.replace(year=last_year)
        fill = src.assign_coords(time=[t_new])
        stitched = xr.concat([stitched, fill], dim="time", data_vars="minimal", coords="minimal", compat="override")
        stitched = stitched.sortby("time")
        print(
            f"  Filled missing final month {last_year}-{miss_month:02d} "
            f"from {last_year - 1}-{miss_month:02d} in '{scenario}'."
        )
    elif last_year_count < 12:
        print(f"  Cropping incomplete last year {last_year} "
              f"({last_year_count}/12 months) from '{scenario}'.")
        stitched = stitched.sel(time=stitched.time.dt.year < last_year)

    return stitched


def load_all_scenarios(
    variable: str | list[str] | None = None,
    dirpath: str = DIRPATH,
    scenarios: list[str] | None = None,
    lat: float | None = None,
    lon: float | None = None,
) -> dict[str, xr.Dataset]:
    """
    Convenience wrapper: load multiple (or all) scenarios into a dict.

    Parameters
    ----------
    variable : str | list[str] | None
        Variable(s) to load (default: module-level VARIABLES).
    dirpath : str
        Root directory containing the NetCDF files.
    scenarios : list[str] | None
        Subset of scenarios to load; defaults to ALL_SCENARIOS.
    lat : float | None
        Nearest-neighbour latitude to select (forwarded to load_scenario).
    lon : float | None
        Nearest-neighbour longitude to select (forwarded to load_scenario).

    Returns
    -------
    dict[str, xr.Dataset]
        Mapping scenario_name → stitched Dataset.
    """
    if scenarios is None:
        scenarios = ALL_SCENARIOS
    return {s: load_scenario(s, variable=variable, dirpath=dirpath, lat=lat, lon=lon) for s in scenarios}


def _to_monthly_dataframe(series_by_scenario: dict[str, pd.Series]) -> pd.DataFrame:
    """Combine per-scenario monthly Series into one DataFrame."""
    if not series_by_scenario:
        return pd.DataFrame()
    df = pd.concat(series_by_scenario, axis=1)
    df.columns = [str(c) for c in df.columns]
    df.index.name = "time"
    return df


def _log(msg: str) -> None:
    """Small timestamped logger for long-running CLI tasks."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True)


def _write_flattened_glacier_file(
    ds: xr.Dataset,
    glacier_ds: xr.Dataset,
    scenario: str,
    out_dir: Path,
) -> Path:
    """Write a flattened glacier-only NetCDF for one scenario.

    Output variables:
      - tas(time, points)
      - pr(time, points)
      - glacier_area(points)
      - glacier_area_by_region(reg, points)
      - latitude(points)
      - longitude(points)
    """
    # Align glacier weights to climate coordinates (handles tiny numeric
    # coordinate differences between files while keeping the same grid shape).
    glacier_on_ds = glacier_ds.sel(lat=ds.lat, lon=ds.lon, method="nearest")

    mask = glacier_on_ds.glacier_area.fillna(0) > 0
    mask_points = mask.stack(points=("lat", "lon"))
    keep_idx = np.flatnonzero(mask_points.values)
    if len(keep_idx) == 0:
        raise ValueError(f"Scenario '{scenario}': no glacier points found (mask empty).")

    ds_flat = ds[["tas", "pr"]].stack(points=("lat", "lon")).isel(points=keep_idx)
    # Drop stacked MultiIndex to avoid conflicts when assigning integer points.
    ds_flat = ds_flat.reset_index("points", drop=False)
    area_flat = glacier_on_ds.glacier_area.fillna(0).stack(points=("lat", "lon")).isel(points=keep_idx)
    area_flat = area_flat.reset_index("points", drop=True)
    area_reg_flat = (
        glacier_on_ds.glacier_area_by_region.fillna(0)
        .stack(points=("lat", "lon"))
        .isel(points=keep_idx)
    )
    area_reg_flat = area_reg_flat.reset_index("points", drop=True)

    out = xr.Dataset(
        data_vars={
            "tas": ds_flat["tas"],
            "pr": ds_flat["pr"],
            "glacier_area": area_flat,
            "glacier_area_by_region": area_reg_flat,
        },
        coords={
            "time": ds_flat.time,
            "points": np.arange(ds_flat.sizes["points"], dtype=np.int64),
            "reg": glacier_on_ds.reg,
            "latitude": ("points", ds_flat.lat.values),
            "longitude": ("points", ds_flat.lon.values),
        },
        attrs={
            "Conventions": "CF-1.7",
            "history": (
                f"{datetime.now().isoformat(timespec='seconds')} stitched scenario and "
                "flattened to glacier-only points"
            ),
            "source": "UKESM1-2-LL TerraFIRMA scenario stitching",
            "scenario": scenario,
            "note": "Contains only grid points where glacier_area > 0.",
        },
    )

    out["tas"].attrs.setdefault("long_name", "Near-surface air temperature")
    out["pr"].attrs.setdefault("long_name", "Precipitation")
    out["latitude"].attrs.update({"units": "degrees_north", "standard_name": "latitude"})
    out["longitude"].attrs.update({"units": "degrees_east", "standard_name": "longitude"})

    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"ukesm_{scenario}_flat_glaciers.nc"
    out.to_netcdf(out_path)
    out.close()
    return out_path


def generate_summary_files(
    summary_dir: str | Path = "summary",
    scenarios: list[str] | None = None,
    dirpath: str = DIRPATH,
    glacier_grid_path: str | Path = "glacier_grid_on_ukesm.nc",
) -> None:
    """Generate monthly summary CSV files for all scenarios.

        Outputs written in *summary_dir*:
      - global_tas_monthly.csv
      - global_pr_monthly.csv
      - glacier_global_tas_monthly.csv
      - glacier_global_pr_monthly.csv
      - by_region/tas_monthly_regXX.csv
      - by_region/pr_monthly_regXX.csv
            - flattened/ukesm_<scenario>_flat_glaciers.nc
    """
    if scenarios is None:
        scenarios = ALL_SCENARIOS

    summary_dir = Path(summary_dir)
    by_region_dir = summary_dir / "by_region"
    flat_dir = summary_dir / "flattened"
    summary_dir.mkdir(parents=True, exist_ok=True)
    by_region_dir.mkdir(parents=True, exist_ok=True)
    flat_dir.mkdir(parents=True, exist_ok=True)

    glacier_grid_path = Path(glacier_grid_path)
    if not glacier_grid_path.is_absolute():
        glacier_grid_path = Path(__file__).parent / glacier_grid_path
    if not glacier_grid_path.exists():
        raise FileNotFoundError(f"Glacier grid file not found: {glacier_grid_path}")

    _log(f"Loading glacier grid: {glacier_grid_path}")
    gds = xr.open_dataset(glacier_grid_path)
    if "glacier_area" not in gds or "glacier_area_by_region" not in gds:
        raise ValueError(
            "Glacier grid dataset must contain 'glacier_area' and 'glacier_area_by_region'."
        )

    region_ids = [int(r) for r in gds.reg.values]

    global_tas: dict[str, pd.Series] = {}
    global_pr: dict[str, pd.Series] = {}
    glacier_tas: dict[str, pd.Series] = {}
    glacier_pr: dict[str, pd.Series] = {}
    glacier_tas_by_reg: dict[int, dict[str, pd.Series]] = {r: {} for r in region_ids}
    glacier_pr_by_reg: dict[int, dict[str, pd.Series]] = {r: {} for r in region_ids}

    for scenario in scenarios:
        _log(f"Scenario {scenario}: loading stitched tas/pr dataset")
        ds = load_scenario(scenario, variable=["tas", "pr"], dirpath=dirpath)
        _log(f"Scenario {scenario}: loaded {ds.sizes['time']} monthly steps — reading into memory")
        ds.load()  # force all lazy chunks into RAM once; avoids 38 repeated disk reads in the loops below
        _log(f"Scenario {scenario}: in-memory load complete")

        # Global area-weighted means (cos(latitude) weights)
        w_global = xr.DataArray(
            np.cos(np.deg2rad(ds.lat.values)),
            coords={"lat": ds.lat},
            dims=("lat",),
            name="w_global",
        )
        ts_tas = ds.tas.weighted(w_global).mean(dim=("lon", "lat"))
        ts_pr = ds.pr.weighted(w_global).mean(dim=("lon", "lat"))
        global_tas[scenario] = ts_tas.to_series()
        global_pr[scenario] = ts_pr.to_series()

        # Glacier global weighted means
        w_glacier_global = gds.glacier_area.fillna(0)
        gs_tas = ds.tas.weighted(w_glacier_global).mean(dim=("lon", "lat"))
        gs_pr = ds.pr.weighted(w_glacier_global).mean(dim=("lon", "lat"))
        glacier_tas[scenario] = gs_tas.to_series()
        glacier_pr[scenario] = gs_pr.to_series()

        # Per-region glacier weighted means
        for reg in region_ids:
            w_reg = gds.glacier_area_by_region.sel(reg=reg).fillna(0)
            if float(w_reg.sum()) <= 0:
                continue
            rs_tas = ds.tas.weighted(w_reg).mean(dim=("lon", "lat"))
            rs_pr = ds.pr.weighted(w_reg).mean(dim=("lon", "lat"))
            glacier_tas_by_reg[reg][scenario] = rs_tas.to_series()
            glacier_pr_by_reg[reg][scenario] = rs_pr.to_series()

        out_flat = _write_flattened_glacier_file(ds=ds, glacier_ds=gds, scenario=scenario, out_dir=flat_dir)
        _log(f"Scenario {scenario}: wrote flattened glacier file {out_flat}")
        ds.close()

    _log(f"Writing summary CSV files to: {summary_dir}")
    _to_monthly_dataframe(global_tas).to_csv(summary_dir / "global_tas_monthly.csv")
    _to_monthly_dataframe(global_pr).to_csv(summary_dir / "global_pr_monthly.csv")
    _to_monthly_dataframe(glacier_tas).to_csv(summary_dir / "glacier_global_tas_monthly.csv")
    _to_monthly_dataframe(glacier_pr).to_csv(summary_dir / "glacier_global_pr_monthly.csv")

    for reg in region_ids:
        reg_tag = f"reg{reg:02d}"
        _to_monthly_dataframe(glacier_tas_by_reg[reg]).to_csv(
            by_region_dir / f"tas_monthly_{reg_tag}.csv"
        )
        _to_monthly_dataframe(glacier_pr_by_reg[reg]).to_csv(
            by_region_dir / f"pr_monthly_{reg_tag}.csv"
        )

    _log("Summary generation complete.")


# ---------------------------------------------------------------------------
# Quick demo / smoke test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    import sys

    # CLI mode 1: summary generation
    #   python terrafirma_helpers.py summaries
    #   python terrafirma_helpers.py summaries up2p0,up2p0-gwl2p0
    if len(sys.argv) > 1 and sys.argv[1] == "summaries":
        scenarios_arg = sys.argv[2] if len(sys.argv) > 2 else None
        scenarios = None if scenarios_arg is None else [s.strip() for s in scenarios_arg.split(",") if s.strip()]
        generate_summary_files(scenarios=scenarios)
        raise SystemExit(0)

    # CLI mode 2: existing smoke test
    scenario = sys.argv[1] if len(sys.argv) > 1 else "up2p0-gwl4p0"
    var = sys.argv[2] if len(sys.argv) > 2 else "tas"

    print(f"Resolving ancestry for '{scenario}':")
    for seg, frm, to in _resolve_ancestry(scenario):
        print(f"  {seg:45s}  [{frm} – {to}]")

    print(f"\nLoading variable '{var}' ...")
    ds = load_scenario(scenario, variable=var, lat=float(sys.argv[3]) if len(sys.argv) > 3 else None, lon=float(sys.argv[4]) if len(sys.argv) > 4 else None)
    print(ds)
    print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}")
    print(f"Ancestry:   {ds.attrs['ancestry']}")