{ "cells": [ { "cell_type": "markdown", "id": "dffd70cd-400c-4634-afb7-2afdf7f23209", "metadata": {}, "source": [ "# Flatten monthly ERA5 files and select only glaciated gridpoints \n", "\n", "--> monthly: currently from 1940-01 to 2025-09, but only applied the flattening until end of 2024\n", "\n", "In `/home/www/lschuster/example_ipynb/flatten_glacier_gridpoint_tests_2025_6_new_isimip3b.ipynb`, we check if the flattening approach works as it should. \n", "\n", "- todo:\n", " - create a workflow that directly checks how much got downloaded and appends the missing months ...? (eventually even starts a script to download the missing files until now) \n", " - check if the files can be easily incorporated into existing OGGM methods/functions ... " ] }, { "cell_type": "code", "execution_count": 1, "id": "83acffa7-6af9-402e-92df-a24ccd179136", "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "import numpy as np\n", "import pandas as pd\n", "from oggm import utils\n", "\n", "import sys\n", "import matplotlib.pyplot as plt\n", "import os\n", "import glob\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "5299a1f3-1437-47bc-bdda-871264d1f807", "metadata": {}, "outputs": [], "source": [ "resolution = 'Monthly' #'Monthly'\n", "y0 = 1940\n", "y1 = 2024 #2024" ] }, { "cell_type": "code", "execution_count": 3, "id": "bbc660ee-c05c-41b1-b879-d29796fc2e48", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Size: 6kB\n", "array([ 90. , 89.75, 89.5 , ..., -89.5 , -89.75, -90. ])\n", "Coordinates:\n", " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", " number int64 8B ...\n", " expver Size: 12kB\n", "array([0.0000e+00, 2.5000e-01, 5.0000e-01, ..., 3.5925e+02, 3.5950e+02,\n", " 3.5975e+02])\n", "Coordinates:\n", " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", " number int64 8B ...\n", " expver = 0)\n", "nx, ny = ds_inv.sizes['longitude'], ds_inv.sizes['latitude']\n", "# get the dataset where coordinates of glaciers are stored\n", "frgi = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')\n", "odf = pd.read_hdf(frgi, index_col=0)\n", "# attention the rgi62 stats are in longitude from -179 to 180 \n", "assert odf.CenLon.max() <180\n", "assert odf.CenLon.min() <-175 \n", "cenlon_for_bins = np.where(odf['CenLon'] < 0, odf['CenLon']+360, odf['CenLon'])\n", "# Nearest neighbor lookup --> W5E5 is 0.5° grid, ERA5 is 0.25° grid ... \n", "lon_bins = np.linspace(ds_inv.longitude.data[0] - 0.125, ds_inv.longitude.data[-1] + 0.125, nx+1)\n", "# !!! attention W5E5 sorted from 90 to -90 !!!!\n", "lat_bins = np.linspace(ds_inv.latitude.data[0] + 0.125, ds_inv.latitude.data[-1] - 0.125, ny+1)\n", "assert np.diff(lon_bins).min() == 0.25\n", "assert np.diff(lon_bins).max() == 0.25\n", "odf['lon_id'] = np.digitize(cenlon_for_bins, lon_bins)-1\n", "odf['lat_id'] = np.digitize(odf['CenLat'], lat_bins)-1\n", "# if a glacier has a longitude above or equal 359.875, it hould be again in the first bin \n", "# thus if \"lon_id\" ==1440 it is actually in the first bin ... 
\n", "odf.loc[odf['lon_id'] == nx, 'lon_id'] = 0\n", "\n", "odf['lon_val'] = ds_inv.longitude.data[odf.lon_id]\n", "odf['lat_val'] = ds_inv.latitude.data[odf.lat_id]\n", "# Use unique grid points as index and compute the area per location\n", "odf['unique_id'] = ['{:03d}_{:03d}'.format(lon, lat) for (lon, lat) in zip(odf['lon_id'], odf['lat_id'])]\n", "mdf = odf.drop_duplicates(subset='unique_id').set_index('unique_id')\n", "mdf['Area'] = odf.groupby('unique_id').sum()['Area']\n", "print('Total number of glaciers: {} and number of ERA5 gridpoints with glaciers in them: {}'.format(len(odf), len(mdf)))\n", "\n", "# this is the mask that we need to remove all non-glacierized gridpoints\n", "mask = np.full((ny, nx), np.NaN)\n", "mask[mdf['lat_id'], mdf['lon_id']] = 1 \n", "ds_inv['glacier_mask'] = (('latitude', 'longitude'), np.isfinite(mask))\n", "\n", "# check the distance to the gridpoints-> it should never be larger than \n", "diff_lon = ds_inv.longitude.data[odf.lon_id] - odf.CenLon\n", "# if the distance is 360 -> it is the same as 0,\n", "diff_lon = np.where(np.abs(diff_lon - 360) < 170, diff_lon-360, diff_lon)\n", "odf['ll_dist_to_point'] = ((diff_lon)**2 + (ds_inv.latitude.data[odf.lat_id] - odf.CenLat)**2)**0.5\n", "assert odf['ll_dist_to_point'].max() < (0.125**2 + 0.125**2)**0.5\n", "# somehow the orography has a date, but wewant to remove tat\n", "ds_inv = ds_inv.sel(valid_time=\"2020-01-01\")\n", "\n", "###### create the invariant file from the first Monthly dataset using one of the two variables\n", "if resolution == 'Monthly':\n", " var = 'tp'\n", " ds = xr.open_dataset(f\"/home/www/oggm/climate/wip/era5_monthly/_raw_files/ERA5_Monthly_{var}_1940.nc\")\n", " # they are named already correctly, and are correctly going already from 0 to 360 \n", " # (so I removed all transformation steps)\n", " assert np.all(ds.longitude >= 0)\n", " assert np.all(ds.latitude <= 90)\n", " ds['ASurf'] = ds_inv['orography'].copy() #ds_inv['ASurf']\n", " \n", " # as we use here only those gridpoints where glaciers are involved, need to put the mask on dsi as well!\n", " # dsi = ds_inv.where(ds_inv['glacier_mask'], drop = True) # this makes out of in total 6483600 points only XXX points!!!\n", " dsi = ds.isel(time=[0]).where(ds_inv['glacier_mask'], drop = True)\n", " # we do not want any dependency on latitude and longitude\n", " dsif = dsi.stack(points=('latitude', 'longitude')).reset_index(('points'))\n", " #dsif # so far still many points \n", " \n", " # drop the non-glacierized points\n", " dsifs = dsif.where(np.isfinite(dsi[var].stack(points=('latitude',\n", " 'longitude')).reset_index(('time',\n", " 'points'))), drop=True)\n", " # I have to drop the 'time_' dimension, to be equal to the era5_land example, because the invariant file should not have any time dependence !\n", " dsifs = dsifs.drop_vars(var)\n", " dsifs = dsifs.drop_vars(['number','valid_time', 'expver'])\n", " #dsifs = dsifs.drop_vars('time')\n", " # I don't want to drop the dimension as ASurf is dependent on the timestep, but I want to remove the time dependency\n", " # dsifs = dsifs.drop_dims('time')\n", " dsifs = dsifs.squeeze('time')\n", " #dsifs = dsifs.drop('time_')\n", " dsifs.attrs['version'] = '2025.11'\n", " dsifs.to_netcdf('../2025.11/era5_glacier_invariant_flat.nc')\n", " \n", " #check if gridpoint nearest to hef is right!!!\n", " # happened once that something went wrong here ...\n", " lon, lat = (10.7584, 46.8003)\n", " #orog = 
xr.open_dataset('/home/lilianschuster/Downloads/w5e5v2.0_glacier_invariant_flat.nc').ASurf\n", " orog = dsifs.ASurf\n", " c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2\n", " surf_hef = orog.isel(points=c.argmin())\n", " np.testing.assert_allclose(surf_hef, 2250, rtol=0.1)\n", " \n", " # check another glacier\n", " # RGI60-19.00124\n", " lon,lat = (-70.8931 +360, -72.4474)\n", " orog = dsifs.ASurf\n", " c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2\n", " surf_g = orog.isel(points=c.argmin())\n", " np.testing.assert_allclose(surf_g.longitude, lon, rtol=0.2)\n", " np.testing.assert_allclose(surf_g.latitude, lat, rtol=0.2)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "c95cb54d-cc4e-47cb-bf24-83d1868be478", "metadata": {}, "outputs": [], "source": [ "# Now run it over all years \n", "try:\n", " os.mkdir('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_tp'.format(resolution.lower()))\n", " os.mkdir('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_t2m'.format(resolution.lower()))\n", "except:\n", " pass\n", "\n", "for y in np.arange(y0,y1+1): \n", " for var in ['t2m','tp']:\n", " if resolution == 'Monthly':\n", " ds = xr.open_dataset(f\"/home/www/oggm/climate/wip/era5_monthly/_raw_files/ERA5_{resolution}_{var}_{y}.nc\")\n", " else:\n", " ds = xr.open_mfdataset(f\"/home/www/oggm/climate/wip/era5_daily_lily/_raw_files/ERA5_{resolution}_{var}_{y}*.nc\")\n", " ds = ds.load()\n", " # they are named already correctly, and are correctly going already from 0 to 360 \n", " # (so I removed all transformation steps)\n", " assert np.all(ds.longitude >= 0)\n", " assert np.all(ds.latitude <= 90)\n", " # this takes too long !!!\n", " # ds_merged_glaciers = xr_prcp.where(ds_inv['glacier_mask'], drop = True)\n", " ds = ds.drop_vars(['number'])\n", " try:\n", " ds = ds.drop_vars(['expver'])\n", " except: \n", " pass\n", " for t in ds.time.values[:]: \n", " # produce a temporary file for each month (if \"Monthly\" or day if \"Daily\")\n", " sel_l = ds.sel(time=[t])\n", " # don't do the dropping twice!!!!\n", " #sel = sel_l.where(ds_inv['glacier_mask'], drop = True)\n", " sel = sel_l.where(ds_inv['glacier_mask'])\n", " sel = sel.stack(points=('latitude', 'longitude')).reset_index(('time', 'points'))\n", " sel = sel.where(np.isfinite(sel[var]), drop=True)\n", " sel = sel.drop_vars(['valid_time'])\n", " sel.to_netcdf('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}_{}.nc'.format(resolution.lower(),var, resolution, str(t)[:10]))\n", " sel.close()\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "ec3ca152-e07a-438c-b63d-ae0d28a92338", "metadata": {}, "outputs": [], "source": [ "# make yearly files\n", "for var in ['t2m', 'tp']:\n", " for y in np.arange(y0, y1+1):\n", " dso_y = xr.open_mfdataset('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}_{}-*.nc'.format(resolution.lower(),var, resolution, str(y)),\n", " concat_dim='time', combine='nested') # .rename_vars({'time_':'time'})\n", " dso_y.to_netcdf('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp2_{}_{}.nc'.format(resolution.lower(),var, resolution, str(y)))\n", " dso_y.close()\n", " " ] }, { "cell_type": "code", "execution_count": 6, "id": "19cc53cb-d690-4b12-a714-4b13abae4476", "metadata": {}, "outputs": [], "source": [ "#xr.open_dataset(f).drop_vars(['expver','number','valid_time'], errors='ignore').reset_coords()" ] }, { "cell_type": "code", "execution_count": 7, "id": "1052625d-391e-489e-9d8f-d34aa8fd8434", "metadata": {}, "outputs": [], "source": [ "for var in ['tp', 't2m']:\n", " 
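\n", " # gather the yearly tmp2_* files for this variable; sorted() matters because files[-1] is treated below as the (possibly incomplete) last year\n", "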
files = sorted(glob.glob(f'/home/www/oggm/climate/wip/era5_{resolution.lower()}/_raw_files/tmp_{var}/tmp2_{resolution}*.nc'))\n", " ds_list = []\n", "\n", " for f in files:\n", " ds = xr.open_dataset(f)\n", " # drop variables that could break the concatenation\n", " ds = ds.drop_vars(['expver','number','valid_time'], errors='ignore')\n", " ds = ds.reset_coords() # not drop=True, as I still need the latitudes and longitudes\n", " ds.load()\n", " if resolution == 'Monthly':\n", " if ds.time.size < 12:\n", " if f == files[-1]: # only do this if it is the last file\n", " m_last = f'-{int(ds.time.size):02d}'\n", " # pad last file if monthly --> actually not necessary\n", " #ds = pad_months_after_last(ds, expected_n=12)\n", " else:\n", " raise ValueError(f\"File {f} has only {ds.time.size} months, but it is NOT the last file!\")\n", " else:\n", " m_last = ''\n", " if resolution == 'Daily':\n", " if ds.time.size <28:\n", " raise ValueError(f\"File {f} has fewer than 28 days, something is wrong\")\n", " if f == files[-1]:\n", " last_timestamp = ds.time.values[-1]\n", " _m = last_timestamp.astype('datetime64[M]').astype(int) % 12 + 1\n", " _d = (last_timestamp - last_timestamp.astype('datetime64[M]')).astype('timedelta64[D]').astype(int) + 1\n", " if _m==12 and _d==31:\n", " m_last = ''\n", " else:\n", " m_last = f'-{int(_m):02d}'\n", " ds_list.append(ds)\n", "\n", " # concatenate all along time\n", " ds_all = xr.concat(ds_list, dim='time', data_vars='all')\n", "\n", " # update attributes\n", " ds_all.attrs['history'] = ds_all.history + '; only glacier gridpoints chosen and flattened latitude/longitude --> points'\n", " ds_all.attrs['postprocessing_date'] = str(np.datetime64('today','D'))\n", " ds_all.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'\n", " ds_all.attrs['version'] = '2025.11'\n", "\n", " # write once\n", " out_path = f'/home/www/oggm/climate/wip/era5_{resolution.lower()}/flattened/2025.11/{resolution.lower()}/era5_{var}_global_{resolution.lower()}_{y0}_{y1}{m_last}_flat_glaciers.nc'\n", " ds_all.to_netcdf(out_path, format='NETCDF4', engine='netcdf4', unlimited_dims=['time'])\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "d6fad2b9-327b-4bb4-8006-d4b7773f4580", "metadata": {}, "outputs": [], "source": [ "for var in ['tp', 't2m']:\n", " files = glob.glob('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}*'.format(resolution.lower(), var, resolution))\n", " for f in files:\n", " os.remove(f)\n", " files = glob.glob('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp2_{}*'.format(resolution.lower(), var, resolution))\n", " for f in files:\n", " os.remove(f)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a56a3ad9-4ac6-4202-baaa-fb24baa2d368", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9dcc87e4-c61f-44c7-981c-d1a6bbe1ff79", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2ec8ae41-1272-4030-951c-5aee0a616bc5", "metadata": {}, "source": [ "# Appendix" ] }, { "cell_type": "code", "execution_count": null, "id": "2dec896b-35ae-4b4e-b5ed-91152e7e6e0a", "metadata": {}, "outputs": [], "source": [ "# only necessary if there is not a full year available, but when using plain xr.concat it is actually no longer needed ...\n", "import cftime  # needed for the cftime branch in pad_months_after_last below\n", "
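# usage sketch (added): ds_full = pad_months_after_last(ds_partial, expected_n=12)\n", "# (ds_partial = a hypothetical monthly dataset with fewer than 12 timesteps; the missing months are appended as NaN)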
\n", "def pad_months_after_last(ds, expected_n=12):\n", " \n", " n = ds.sizes['time']\n", " if n >= expected_n:\n", " return ds\n", " missing = expected_n - n\n", " # get last time\n", " last = ds.time.values[-1]\n", "\n", " # branch depending on dtype: datetime64 vs cftime\n", " if np.issubdtype(ds.time.dtype, np.datetime64):\n", " last_ts = pd.to_datetime(last)\n", " # compute first missing month: first day of next month\n", " start = (last_ts + pd.offsets.MonthBegin()).normalize()\n", " pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n", " pad_time_values = pad_times.values # numpy.datetime64[ns]\n", " else:\n", " # use cftime: get a cftime datetime for the next month\n", " # handle different cftime calendars via cftime library defaults\n", " if isinstance(last, cftime.datetime):\n", " year = last.year\n", " month = last.month + 1\n", " year += (month - 1) // 12\n", " month = (month - 1) % 12 + 1\n", " pad_time_values = []\n", " for i in range(missing):\n", " m = month + i\n", " y = year + (m - 1) // 12\n", " mm = (m - 1) % 12 + 1\n", " pad_time_values.append(cftime.datetime(y, mm, 1))\n", " else:\n", " # fallback: convert to pandas and then to cftime datetimes\n", " last_ts = pd.to_datetime(last)\n", " start = (last_ts + pd.offsets.MonthBegin()).normalize()\n", " pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n", " pad_time_values = [cftime.datetime(t.year, t.month, t.day) for t in pad_times]\n", "\n", " # create a pad dataset: copy single-time slice and expand with NaNs\n", " pad = ds.isel(time=slice(0, 1)).copy(deep=True) * np.nan\n", " # tile to required number of missing months\n", " pad = pad.isel(time=0).expand_dims(time=pad_time_values)\n", " # assign correct time coordinate\n", " pad = pad.assign_coords(time=('time', pad_time_values))\n", " # concatenate along time\n", " ds_padded = xr.concat([ds, pad], dim='time')\n", " return ds_padded\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:oggm_env_2025]", "language": "python", "name": "conda-env-oggm_env_2025-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.14" } }, "nbformat": 4, "nbformat_minor": 5 }