""" ukesm_scenarios.py ------------------ Helper module for loading and stitching UKESM1-2-LL scenario files. Scenarios branch from each other at different points in time (see figure). This module resolves the full ancestry chain for any requested scenario and returns a single continuous xarray.Dataset from the pre-industrial past through the end of the scenario. Files are searched in DIRPATH and NEW_DATA_DIRPATH. If both contain the same scenario/variable, NEW_DATA_DIRPATH is preferred and a note is printed. Scenario tree (approximate branch years): piControl ────────────────────────────────────────────────────────────── hist (1850–2014, standalone) up2p0 (1850+, standalone — parallel run to hist, not a branch) ├── up2p0-gwl1p5 (branches ~1919) │ ├── up2p0-gwl1p5-50y-dn1p0 (branches ~1969) │ └── up2p0-gwl1p5-200y-dn1p0 (branches ~2119) ├── up2p0-gwl2p0 (branches ~1944) │ ├── up2p0-gwl2p0-50y-dn0p5 (branches ~1994) │ ├── up2p0-gwl2p0-50y-dn1p0 (branches ~1994) │ ├── up2p0-gwl2p0-50y-dn2p0 (branches ~1994) │ ├── up2p0-gwl2p0-200y-dn0p5 (branches ~2144) │ └── up2p0-gwl2p0-200y-dn1p0 (branches ~2144) ├── up2p0-gwl3p0 (branches ~1992) │ ├── up2p0-gwl3p0-50y-dn0p5 (branches ~2043) │ ├── up2p0-gwl3p0-50y-dn1p0 (branches ~2043) │ ├── up2p0-gwl3p0-50y-dn2p0 (branches ~2044) │ ├── up2p0-gwl3p0-200y-dn0p5 (branches ~2193) │ └── up2p0-gwl3p0-200y-dn1p0 (branches ~2182) ├── up2p0-gwl4p0 (branches ~2044) │ ├── up2p0-gwl4p0-50y-dn0p5 (branches ~2095) │ ├── up2p0-gwl4p0-50y-dn2p0 (branches ~2094) │ │ └── up2p0-gwl4p0-50y-dn2p0-gwl2p0 (branches ~2233) │ └── up2p0-gwl4p0-200y-dn1p0 (branches ~2245) ├── up2p0-gwl5p0 (branches ~2082) │ ├── up2p0-gwl5p0-50y-dn2p0 (branches ~2132) │ ├── up2p0-gwl5p0-200y-dn0p5 (branches ~2282) │ ├── up2p0-gwl5p0-200y-dn1p0 (branches ~2282) │ └── up2p0-gwl5p0-200y-dn2p0 (branches ~2282) └── up2p0-gwl6p0 (branches ~2137) ├── up2p0-gwl6p0-50y-dn1p0 (branches ~2187) ├── up2p0-gwl6p0-50y-dn2p0 (branches ~2187) ├── up2p0-gwl6p0-200y-dn0p5 (branches ~2337) ├── up2p0-gwl6p0-200y-dn1p0 (branches ~2337) └── up2p0-gwl6p0-200y-dn2p0 (branches ~2337) """ import glob import xarray as xr import numpy as np import pandas as pd from pathlib import Path from datetime import datetime # --------------------------------------------------------------------------- # Configuration — adjust these two globals for your environment # --------------------------------------------------------------------------- DIRPATH = "/home/www/lschuster/terrafirma_oggm_proj/terrafirma_data/raw_data" # Additional local data directory. Files here supplement DIRPATH; having the # same scenario/variable in both directories raises a FileExistsError. NEW_DATA_DIRPATH = str(Path(__file__).parent / "new_data") VARIABLES = ["tas", "pr"] # variables to load; set to None to load all # Prefix prepended to scenario names in the actual filenames on disk # e.g. registry name "up2p0" -> filename fragment "esm-up2p0" FILE_SCENARIO_PREFIX = "esm-" # --------------------------------------------------------------------------- # Scenario registry # # Each entry maps a scenario name to its *parent* scenario and the year at # which it branches off. The loader will concatenate the parent chain up to # the branch year, then append the child scenario's own files. # # branch_year: the first year that belongs to the child scenario. # Everything strictly before this year is taken from the parent. # --------------------------------------------------------------------------- SCENARIO_PARENTS: dict[str, dict] = { # Leaf entries have no parent (or parent = None / piControl not loaded here) "piControl": {"parent": None, "branch_year": None}, "hist": {"parent": None, "branch_year": None}, # standalone 1850–2014 "up2p0": {"parent": None, "branch_year": None}, # standalone from 1850, parallel to hist # GWL1.5 branch and its children "up2p0-gwl1p5": {"parent": "up2p0", "branch_year": 1919}, "up2p0-gwl1p5-50y-dn1p0": {"parent": "up2p0-gwl1p5", "branch_year": 1969}, "up2p0-gwl1p5-200y-dn1p0": {"parent": "up2p0-gwl1p5", "branch_year": 2119}, # GWL2.0 branch and its children "up2p0-gwl2p0": {"parent": "up2p0", "branch_year": 1944}, "up2p0-gwl2p0-50y-dn0p5": {"parent": "up2p0-gwl2p0", "branch_year": 1994}, "up2p0-gwl2p0-50y-dn1p0": {"parent": "up2p0-gwl2p0", "branch_year": 1994}, "up2p0-gwl2p0-50y-dn2p0": {"parent": "up2p0-gwl2p0", "branch_year": 1994}, "up2p0-gwl2p0-200y-dn0p5": {"parent": "up2p0-gwl2p0", "branch_year": 2144}, "up2p0-gwl2p0-200y-dn1p0": {"parent": "up2p0-gwl2p0", "branch_year": 2144}, # GWL3.0 branch and its children "up2p0-gwl3p0": {"parent": "up2p0", "branch_year": 1992}, "up2p0-gwl3p0-50y-dn0p5": {"parent": "up2p0-gwl3p0", "branch_year": 2043}, "up2p0-gwl3p0-50y-dn1p0": {"parent": "up2p0-gwl3p0", "branch_year": 2043}, "up2p0-gwl3p0-50y-dn2p0": {"parent": "up2p0-gwl3p0", "branch_year": 2044}, "up2p0-gwl3p0-200y-dn0p5": {"parent": "up2p0-gwl3p0", "branch_year": 2193}, "up2p0-gwl3p0-200y-dn1p0": {"parent": "up2p0-gwl3p0", "branch_year": 2182}, # GWL4.0 branch and its children "up2p0-gwl4p0": {"parent": "up2p0", "branch_year": 2044}, "up2p0-gwl4p0-50y-dn0p5": {"parent": "up2p0-gwl4p0", "branch_year": 2095}, "up2p0-gwl4p0-50y-dn2p0": {"parent": "up2p0-gwl4p0", "branch_year": 2094}, "up2p0-gwl4p0-50y-dn2p0-gwl2p0": {"parent": "up2p0-gwl4p0-50y-dn2p0", "branch_year": 2233}, "up2p0-gwl4p0-200y-dn1p0": {"parent": "up2p0-gwl4p0", "branch_year": 2245}, # GWL5.0 branch and its children "up2p0-gwl5p0": {"parent": "up2p0", "branch_year": 2082}, "up2p0-gwl5p0-50y-dn2p0": {"parent": "up2p0-gwl5p0", "branch_year": 2132}, "up2p0-gwl5p0-200y-dn0p5": {"parent": "up2p0-gwl5p0", "branch_year": 2282}, "up2p0-gwl5p0-200y-dn1p0": {"parent": "up2p0-gwl5p0", "branch_year": 2282}, "up2p0-gwl5p0-200y-dn2p0": {"parent": "up2p0-gwl5p0", "branch_year": 2282}, # GWL6.0 branch and its children "up2p0-gwl6p0": {"parent": "up2p0", "branch_year": 2137}, "up2p0-gwl6p0-50y-dn1p0": {"parent": "up2p0-gwl6p0", "branch_year": 2187}, "up2p0-gwl6p0-50y-dn2p0": {"parent": "up2p0-gwl6p0", "branch_year": 2187}, "up2p0-gwl6p0-200y-dn0p5": {"parent": "up2p0-gwl6p0", "branch_year": 2337}, "up2p0-gwl6p0-200y-dn1p0": {"parent": "up2p0-gwl6p0", "branch_year": 2337}, "up2p0-gwl6p0-200y-dn2p0": {"parent": "up2p0-gwl6p0", "branch_year": 2337}, } # Scenarios that must never appear as a parent of another scenario _UNBRANCHING_SCENARIOS = {"hist", "piControl"} # Convenience list of all known scenario names (for validation / iteration) ALL_SCENARIOS: list[str] = list(SCENARIO_PARENTS.keys()) # --------------------------------------------------------------------------- # Module-load-time sanity checks on the registry # --------------------------------------------------------------------------- for _name, _info in SCENARIO_PARENTS.items(): _parent = _info["parent"] if _parent is None: continue if _parent not in SCENARIO_PARENTS: raise ValueError( f"SCENARIO_PARENTS['{_name}'] references unknown parent '{_parent}'." ) if _parent in _UNBRANCHING_SCENARIOS: raise ValueError( f"SCENARIO_PARENTS['{_name}'] uses '{_parent}' as a parent, " f"but '{_parent}' is never branched from." ) del _name, _info, _parent # clean up loop variables from module namespace # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _glob_scenario_files(scenario: str, variable: str, dirpath: str = DIRPATH) -> list[str]: """Return sorted list of NetCDF files for a given scenario and variable. Searches *dirpath* and NEW_DATA_DIRPATH. If both contain matching files, NEW_DATA_DIRPATH is preferred and a note is printed. """ file_scenario = f"{FILE_SCENARIO_PREFIX}{scenario}" fname_glob = f"{variable}_Amon_UKESM1-2-LL_{file_scenario}_r1i1p1f1_gn_*.nc" files_main = sorted(glob.glob(str(Path(dirpath) / fname_glob))) files_extra: list[str] = [] if NEW_DATA_DIRPATH and Path(NEW_DATA_DIRPATH).resolve() != Path(dirpath).resolve(): files_extra = sorted(glob.glob(str(Path(NEW_DATA_DIRPATH) / fname_glob))) if files_main and files_extra: print( f" Note: scenario='{scenario}', variable='{variable}' found in both " f"directories; using new_data version (longer record).\n" f" DIRPATH : {[Path(f).name for f in files_main]}\n" f" new_data : {[Path(f).name for f in files_extra]}" ) return files_extra files = files_main or files_extra if not files: raise FileNotFoundError( f"No files found for scenario='{scenario}', variable='{variable}' in:\n" f" {Path(dirpath) / fname_glob}\n" f" {Path(NEW_DATA_DIRPATH) / fname_glob}" ) return files def _fill_missing_months(ds: xr.Dataset, scenario: str = "") -> xr.Dataset: """Detect single missing months and fill them with the same month from the prior year. A gap of exactly 2 in the month-index sequence (year*12 + month-1) means one month is absent. Each such gap is filled by copying the data for that calendar month from the immediately preceding year and re-labelling the time coordinate. Raises an error if more than 5 gaps are found (which would indicate a structural problem rather than a sporadic glitch). """ time = ds.time yr = time.dt.year.values mo = time.dt.month.values month_idx = yr * 12 + (mo - 1) diffs = month_idx[1:] - month_idx[:-1] gap_pos = [i for i, d in enumerate(diffs) if d == 2] if not gap_pos: return ds if len(gap_pos) > 5: raise ValueError( f"Scenario '{scenario}': {len(gap_pos)} missing-month gaps found; " f"expected at most a few. Check the raw files." ) print(f" Filling {len(gap_pos)} missing month(s) in '{scenario}'.") pieces = [] prev = 0 for i in gap_pos: pieces.append(ds.isel(time=slice(prev, i + 1))) # Determine the missing calendar month/year t_before = time.values[i] miss_month = t_before.month % 12 + 1 miss_year = t_before.year + (1 if t_before.month == 12 else 0) # Source: same calendar month from the previous year src_mask = (ds.time.dt.year == miss_year - 1) & (ds.time.dt.month == miss_month) src = ds.sel(time=src_mask) if len(src.time) == 0: raise ValueError( f"Cannot fill missing {miss_year}-{miss_month:02d} in '{scenario}': " f"no data for the same month in year {miss_year - 1}." ) t_src = src.time.values[0] t_new = t_src.replace(year=miss_year) fill = src.assign_coords(time=[t_new]) pieces.append(fill) prev = i + 1 pieces.append(ds.isel(time=slice(prev, None))) return xr.concat(pieces, dim="time", data_vars="minimal", coords="minimal", compat="override") def _set_month_start_time(ds: xr.Dataset) -> xr.Dataset: """Normalize all timestamps to the first day of their month.""" new_time = [t.replace(day=1) for t in ds.time.values] return ds.assign_coords(time=("time", new_time)) def _open_scenario_raw(scenario: str, variable: str, dirpath: str = DIRPATH) -> xr.Dataset: """Open all files for one scenario/variable as a single Dataset (no stitching). Automatically fills up to a handful of single missing months by copying the same calendar month from the previous year. """ files = _glob_scenario_files(scenario, variable, dirpath) # Use the newer decode_times API to keep cftime support without warnings. time_coder = xr.coders.CFDatetimeCoder(use_cftime=True) ds = xr.open_mfdataset( files, combine="by_coords", decode_times=time_coder, data_vars="minimal", compat="override", coords="minimal", ) ds = _fill_missing_months(ds, scenario=f"{scenario}/{variable}") ds = _set_month_start_time(ds) return ds def _resolve_ancestry(scenario: str) -> list[tuple[str, int | None, int | None]]: """ Walk the parent chain and return an ordered list of (scenario_name, keep_from_year, keep_until_year) where keep_from_year / keep_until_year are the time-slice bounds (None means 'open-ended'). Example for 'up2p0-gwl4p0-50y-dn2p0-gwl2p0': [('hist', None, 1850), ('up2p0', 1850, 2044), ('up2p0-gwl4p0', 2044, 2095), ('up2p0-gwl4p0-50y-dn2p0', 2095, 2233), ('up2p0-gwl4p0-50y-dn2p0-gwl2p0', 2233, None)] """ if scenario not in SCENARIO_PARENTS: raise ValueError( f"Unknown scenario '{scenario}'. " f"Known scenarios: {ALL_SCENARIOS}" ) # Build chain from root to leaf chain = [] current = scenario while current is not None: info = SCENARIO_PARENTS[current] chain.append((current, info["branch_year"])) current = info["parent"] chain.reverse() # now root-first # Convert to (name, keep_from, keep_until) slices segments = [] for i, (name, branch_year) in enumerate(chain): keep_from = branch_year # year this segment starts in the final series # The segment ends when the next child branches off if i + 1 < len(chain): keep_until = chain[i + 1][1] # branch_year of the next entry else: keep_until = None # last segment: keep everything segments.append((name, keep_from, keep_until)) return segments # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def load_scenario( scenario: str, variable: str | list[str] | None = None, dirpath: str = DIRPATH, lat: float | None = None, lon: float | None = None, ) -> xr.Dataset: """ Load a full, stitched time series for *scenario*, optionally restricted to one or more *variable* names. Parameters ---------- scenario : str One of the keys in SCENARIO_PARENTS, e.g. 'up2p0-gwl4p0'. variable : str | list[str] | None Variable(s) to load. Defaults to VARIABLES defined at module level. Pass None to load all variables found in the files. dirpath : str Root directory containing the NetCDF files. lat : float | None If provided together with *lon*, select the nearest-neighbour grid point before loading the full spatial field into memory. lon : float | None Longitude of the point to select (degrees East, -180 to 360). Returns ------- xr.Dataset A single Dataset with a continuous time coordinate spanning the scenario's full history (stitched from parent segments as needed). If lat/lon were supplied the spatial dimensions are dropped and the Dataset contains only the time series for that grid point. """ if variable is None: variable = VARIABLES if isinstance(variable, str): variable = [variable] segments = _resolve_ancestry(scenario) # Load and slice each segment, then concatenate pieces: list[xr.Dataset] = [] for seg_name, keep_from, keep_until in segments: # Load raw data for this ancestor/scenario # For multi-variable loads we open each variable separately and merge if variable is not None: ds_list = [] for var in variable: try: ds_var = _open_scenario_raw(seg_name, var, dirpath) if lat is not None and lon is not None: ds_var = ds_var.sel(lat=lat, lon=lon, method="nearest") ds_list.append(ds_var) except FileNotFoundError as e: print(f" Warning: {e}") if not ds_list: raise FileNotFoundError( f"No data found for any variable in segment '{seg_name}'." ) ds = xr.merge(ds_list, compat="override") else: # Load the first variable found (fallback, single-var glob) raise ValueError("variable=None not supported; specify variable(s) explicitly.") # Slice the *end* of this segment only — we never truncate the start. # keep_until is the branch year of the next child (exclusive upper bound); # everything from that year onward belongs to the child segment instead. if keep_until is not None: ds = ds.sel(time=ds.time.dt.year < keep_until) if len(ds.time) == 0: raise ValueError( f"Segment '{seg_name}' yielded 0 time steps after slicing to " f"year < {keep_until}. Check branch years in SCENARIO_PARENTS." ) pieces.append(ds) if not pieces: raise RuntimeError(f"No data loaded for scenario '{scenario}'.") # Concatenate along time stitched = xr.concat(pieces, dim="time", data_vars="minimal", coords="minimal", compat="override") # ------------------------------------------------------------------ # Continuity check: monthly time axis must be gapless and duplicate-free # ------------------------------------------------------------------ time = stitched.time if len(time) > 1: # Express time as (year * 12 + month) integers for arithmetic months = time.dt.year.values * 12 + (time.dt.month.values - 1) diffs = months[1:] - months[:-1] gaps = (diffs != 1).sum() if gaps > 0: bad = [(str(time.values[i]), str(time.values[i + 1])) for i, d in enumerate(diffs) if d != 1] raise ValueError( f"Stitched time series for '{scenario}' has {gaps} discontinuity/ies " f"(missing or duplicated months). First offending transition(s): {bad[:5]}" ) # Attach scenario metadata stitched.attrs["scenario"] = scenario stitched.attrs["ancestry"] = " → ".join(s[0] for s in segments) # ------------------------------------------------------------------ # Handle an incomplete final year (leaf scenario tail only). # If exactly one month is missing (11/12), fill it from the same # calendar month in the previous year. For larger incompleteness, # keep conservative behavior and crop the final year. # ------------------------------------------------------------------ last_year = int(stitched.time.dt.year.values[-1]) last_year_count = int((stitched.time.dt.year == last_year).sum()) if last_year_count == 11: months_present = set(int(m) for m in stitched.time.dt.month.where(stitched.time.dt.year == last_year, drop=True).values) missing_months = [m for m in range(1, 13) if m not in months_present] if len(missing_months) != 1: raise ValueError( f"Expected exactly one missing month in final year {last_year} for '{scenario}', " f"but found {len(missing_months)}: {missing_months}." ) miss_month = missing_months[0] src_mask = (stitched.time.dt.year == last_year - 1) & (stitched.time.dt.month == miss_month) src = stitched.sel(time=src_mask) if len(src.time) == 0: raise ValueError( f"Cannot fill missing final month {last_year}-{miss_month:02d} in '{scenario}': " f"no source month found in {last_year - 1}-{miss_month:02d}." ) t_src = src.time.values[0] t_new = t_src.replace(year=last_year) fill = src.assign_coords(time=[t_new]) stitched = xr.concat([stitched, fill], dim="time", data_vars="minimal", coords="minimal", compat="override") stitched = stitched.sortby("time") print( f" Filled missing final month {last_year}-{miss_month:02d} " f"from {last_year - 1}-{miss_month:02d} in '{scenario}'." ) elif last_year_count < 12: print(f" Cropping incomplete last year {last_year} " f"({last_year_count}/12 months) from '{scenario}'.") stitched = stitched.sel(time=stitched.time.dt.year < last_year) return stitched def load_all_scenarios( variable: str | list[str] | None = None, dirpath: str = DIRPATH, scenarios: list[str] | None = None, lat: float | None = None, lon: float | None = None, ) -> dict[str, xr.Dataset]: """ Convenience wrapper: load multiple (or all) scenarios into a dict. Parameters ---------- variable : str | list[str] | None Variable(s) to load (default: module-level VARIABLES). dirpath : str Root directory containing the NetCDF files. scenarios : list[str] | None Subset of scenarios to load; defaults to ALL_SCENARIOS. lat : float | None Nearest-neighbour latitude to select (forwarded to load_scenario). lon : float | None Nearest-neighbour longitude to select (forwarded to load_scenario). Returns ------- dict[str, xr.Dataset] Mapping scenario_name → stitched Dataset. """ if scenarios is None: scenarios = ALL_SCENARIOS return {s: load_scenario(s, variable=variable, dirpath=dirpath, lat=lat, lon=lon) for s in scenarios} def _to_monthly_dataframe(series_by_scenario: dict[str, pd.Series]) -> pd.DataFrame: """Combine per-scenario monthly Series into one DataFrame.""" if not series_by_scenario: return pd.DataFrame() df = pd.concat(series_by_scenario, axis=1) df.columns = [str(c) for c in df.columns] df.index.name = "time" return df def _log(msg: str) -> None: """Small timestamped logger for long-running CLI tasks.""" print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}", flush=True) def _write_flattened_glacier_file( ds: xr.Dataset, glacier_ds: xr.Dataset, scenario: str, out_dir: Path, ) -> Path: """Write a flattened glacier-only NetCDF for one scenario. Output variables: - tas(time, points) - pr(time, points) - glacier_area(points) - glacier_area_by_region(reg, points) - latitude(points) - longitude(points) """ # Align glacier weights to climate coordinates (handles tiny numeric # coordinate differences between files while keeping the same grid shape). glacier_on_ds = glacier_ds.sel(lat=ds.lat, lon=ds.lon, method="nearest") mask = glacier_on_ds.glacier_area.fillna(0) > 0 mask_points = mask.stack(points=("lat", "lon")) keep_idx = np.flatnonzero(mask_points.values) if len(keep_idx) == 0: raise ValueError(f"Scenario '{scenario}': no glacier points found (mask empty).") ds_flat = ds[["tas", "pr"]].stack(points=("lat", "lon")).isel(points=keep_idx) # Drop stacked MultiIndex to avoid conflicts when assigning integer points. ds_flat = ds_flat.reset_index("points", drop=False) area_flat = glacier_on_ds.glacier_area.fillna(0).stack(points=("lat", "lon")).isel(points=keep_idx) area_flat = area_flat.reset_index("points", drop=True) area_reg_flat = ( glacier_on_ds.glacier_area_by_region.fillna(0) .stack(points=("lat", "lon")) .isel(points=keep_idx) ) area_reg_flat = area_reg_flat.reset_index("points", drop=True) out = xr.Dataset( data_vars={ "tas": ds_flat["tas"], "pr": ds_flat["pr"], "glacier_area": area_flat, "glacier_area_by_region": area_reg_flat, }, coords={ "time": ds_flat.time, "points": np.arange(ds_flat.sizes["points"], dtype=np.int64), "reg": glacier_on_ds.reg, "latitude": ("points", ds_flat.lat.values), "longitude": ("points", ds_flat.lon.values), }, attrs={ "Conventions": "CF-1.7", "history": ( f"{datetime.now().isoformat(timespec='seconds')} stitched scenario and " "flattened to glacier-only points" ), "source": "UKESM1-2-LL TerraFIRMA scenario stitching", "scenario": scenario, "note": "Contains only grid points where glacier_area > 0.", }, ) out["tas"].attrs.setdefault("long_name", "Near-surface air temperature") out["pr"].attrs.setdefault("long_name", "Precipitation") out["latitude"].attrs.update({"units": "degrees_north", "standard_name": "latitude"}) out["longitude"].attrs.update({"units": "degrees_east", "standard_name": "longitude"}) out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / f"ukesm_{scenario}_flat_glaciers.nc" out.to_netcdf(out_path) out.close() return out_path def generate_summary_files( summary_dir: str | Path = "summary", scenarios: list[str] | None = None, dirpath: str = DIRPATH, glacier_grid_path: str | Path = "glacier_grid_on_ukesm.nc", ) -> None: """Generate monthly summary CSV files for all scenarios. Outputs written in *summary_dir*: - global_tas_monthly.csv - global_pr_monthly.csv - glacier_global_tas_monthly.csv - glacier_global_pr_monthly.csv - by_region/tas_monthly_regXX.csv - by_region/pr_monthly_regXX.csv - flattened/ukesm__flat_glaciers.nc """ if scenarios is None: scenarios = ALL_SCENARIOS summary_dir = Path(summary_dir) by_region_dir = summary_dir / "by_region" flat_dir = summary_dir / "flattened" summary_dir.mkdir(parents=True, exist_ok=True) by_region_dir.mkdir(parents=True, exist_ok=True) flat_dir.mkdir(parents=True, exist_ok=True) glacier_grid_path = Path(glacier_grid_path) if not glacier_grid_path.is_absolute(): glacier_grid_path = Path(__file__).parent / glacier_grid_path if not glacier_grid_path.exists(): raise FileNotFoundError(f"Glacier grid file not found: {glacier_grid_path}") _log(f"Loading glacier grid: {glacier_grid_path}") gds = xr.open_dataset(glacier_grid_path) if "glacier_area" not in gds or "glacier_area_by_region" not in gds: raise ValueError( "Glacier grid dataset must contain 'glacier_area' and 'glacier_area_by_region'." ) region_ids = [int(r) for r in gds.reg.values] global_tas: dict[str, pd.Series] = {} global_pr: dict[str, pd.Series] = {} glacier_tas: dict[str, pd.Series] = {} glacier_pr: dict[str, pd.Series] = {} glacier_tas_by_reg: dict[int, dict[str, pd.Series]] = {r: {} for r in region_ids} glacier_pr_by_reg: dict[int, dict[str, pd.Series]] = {r: {} for r in region_ids} for scenario in scenarios: _log(f"Scenario {scenario}: loading stitched tas/pr dataset") ds = load_scenario(scenario, variable=["tas", "pr"], dirpath=dirpath) _log(f"Scenario {scenario}: loaded {ds.sizes['time']} monthly steps — reading into memory") ds.load() # force all lazy chunks into RAM once; avoids 38 repeated disk reads in the loops below _log(f"Scenario {scenario}: in-memory load complete") # Global area-weighted means (cos(latitude) weights) w_global = xr.DataArray( np.cos(np.deg2rad(ds.lat.values)), coords={"lat": ds.lat}, dims=("lat",), name="w_global", ) ts_tas = ds.tas.weighted(w_global).mean(dim=("lon", "lat")) ts_pr = ds.pr.weighted(w_global).mean(dim=("lon", "lat")) global_tas[scenario] = ts_tas.to_series() global_pr[scenario] = ts_pr.to_series() # Glacier global weighted means w_glacier_global = gds.glacier_area.fillna(0) gs_tas = ds.tas.weighted(w_glacier_global).mean(dim=("lon", "lat")) gs_pr = ds.pr.weighted(w_glacier_global).mean(dim=("lon", "lat")) glacier_tas[scenario] = gs_tas.to_series() glacier_pr[scenario] = gs_pr.to_series() # Per-region glacier weighted means for reg in region_ids: w_reg = gds.glacier_area_by_region.sel(reg=reg).fillna(0) if float(w_reg.sum()) <= 0: continue rs_tas = ds.tas.weighted(w_reg).mean(dim=("lon", "lat")) rs_pr = ds.pr.weighted(w_reg).mean(dim=("lon", "lat")) glacier_tas_by_reg[reg][scenario] = rs_tas.to_series() glacier_pr_by_reg[reg][scenario] = rs_pr.to_series() out_flat = _write_flattened_glacier_file(ds=ds, glacier_ds=gds, scenario=scenario, out_dir=flat_dir) _log(f"Scenario {scenario}: wrote flattened glacier file {out_flat}") ds.close() _log(f"Writing summary CSV files to: {summary_dir}") _to_monthly_dataframe(global_tas).to_csv(summary_dir / "global_tas_monthly.csv") _to_monthly_dataframe(global_pr).to_csv(summary_dir / "global_pr_monthly.csv") _to_monthly_dataframe(glacier_tas).to_csv(summary_dir / "glacier_global_tas_monthly.csv") _to_monthly_dataframe(glacier_pr).to_csv(summary_dir / "glacier_global_pr_monthly.csv") for reg in region_ids: reg_tag = f"reg{reg:02d}" _to_monthly_dataframe(glacier_tas_by_reg[reg]).to_csv( by_region_dir / f"tas_monthly_{reg_tag}.csv" ) _to_monthly_dataframe(glacier_pr_by_reg[reg]).to_csv( by_region_dir / f"pr_monthly_{reg_tag}.csv" ) _log("Summary generation complete.") # --------------------------------------------------------------------------- # Quick demo / smoke test # --------------------------------------------------------------------------- if __name__ == "__main__": import sys # CLI mode 1: summary generation # python terrafirma_helpers.py summaries # python terrafirma_helpers.py summaries up2p0,up2p0-gwl2p0 if len(sys.argv) > 1 and sys.argv[1] == "summaries": scenarios_arg = sys.argv[2] if len(sys.argv) > 2 else None scenarios = None if scenarios_arg is None else [s.strip() for s in scenarios_arg.split(",") if s.strip()] generate_summary_files(scenarios=scenarios) raise SystemExit(0) # CLI mode 2: existing smoke test scenario = sys.argv[1] if len(sys.argv) > 1 else "up2p0-gwl4p0" var = sys.argv[2] if len(sys.argv) > 2 else "tas" print(f"Resolving ancestry for '{scenario}':") for seg, frm, to in _resolve_ancestry(scenario): print(f" {seg:45s} [{frm} – {to}]") print(f"\nLoading variable '{var}' ...") ds = load_scenario(scenario, variable=var, lat=float(sys.argv[3]) if len(sys.argv) > 3 else None, lon=float(sys.argv[4]) if len(sys.argv) > 4 else None) print(ds) print(f"\nTime range: {ds.time.values[0]} → {ds.time.values[-1]}") print(f"Ancestry: {ds.attrs['ancestry']}")