{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dffd70cd-400c-4634-afb7-2afdf7f23209",
   "metadata": {},
   "source": [
    "# Flatten daily or monthly ERA5 files and select only glaciated gridpoints from RGI6, RGI7C and RGI7G\n",
    "\n",
    "--> currently from 1940-01 to 2025-12\n",
    "\n",
    "In http://nbviewer.org/urls/cluster.klima.uni-bremen.de/~oggm/climate/notebooks/flatten_glacier_gridpoint_tests.ipynb, we check if the flattening approach works as it should. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "83acffa7-6af9-402e-92df-a24ccd179136",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xarray as xr\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from oggm import utils\n",
    "\n",
    "import sys\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "import glob\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "5299a1f3-1437-47bc-bdda-871264d1f807",
   "metadata": {},
   "outputs": [],
   "source": [
    "resolution = 'Daily'  # Choose here Monthly ... or Daily\n",
    "y0 = 2025 # 1940\n",
    "y1 = 2025 #2024\n",
    "### this version includes RGI6, RGI7C and RGI7G\n",
    "v = 'v2026.01.14'\n",
    "res = 0.25 # resolution of ERA5\n",
    "path = '/home/www/oggm/climate/notebooks/era5_both/'\n",
    "\n",
    "if resolution == 'Daily':\n",
    "    path_unflattened = '/home/www/oggm/climate/era5/daily/v1.2/unflattened/' \n",
    "else:\n",
    "    path_unflattened = '/home/www/oggm/climate/era5/monthly/v1.2/unflattened/' \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ee88791e-13d8-4c11-909b-b679096e71e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<xarray.DataArray 'latitude' (latitude: 721)> Size: 6kB\n",
      "array([ 90.  ,  89.75,  89.5 , ..., -89.5 , -89.75, -90.  ])\n",
      "Coordinates:\n",
      "  * latitude  (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n",
      "    number    int64 8B ...\n",
      "    expver    <U4 16B ...\n",
      "Attributes:\n",
      "    units:             degrees_north\n",
      "    standard_name:     latitude\n",
      "    long_name:         latitude\n",
      "    stored_direction:  decreasing <xarray.DataArray 'longitude' (longitude: 1440)> Size: 12kB\n",
      "array([0.0000e+00, 2.5000e-01, 5.0000e-01, ..., 3.5925e+02, 3.5950e+02,\n",
      "       3.5975e+02])\n",
      "Coordinates:\n",
      "  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n",
      "    number     int64 8B ...\n",
      "    expver     <U4 16B ...\n",
      "Attributes:\n",
      "    units:          degrees_east\n",
      "    standard_name:  longitude\n",
      "    long_name:      longitude\n",
      "RGI6 ––– Total number of glaciers: 216502 and number of ERA5 gridpoints with glaciers in them: 11293\n",
      "RGI7C ––– Total number of glacier complexes: 192869 and number of ERA5 gridpoints with glacier complexes in them: 10162\n",
      "RGI7G ––– Total number of glaciers: 274531 and number of ERA5 gridpoints with glaciers in them: 11506\n",
      "All versions ––– Total number of glacier IDs: 683902 and number of ERA5 gridpoints with glaciers in them: 12080\n"
     ]
    }
   ],
   "source": [
    "### this is the same for monthly and daily, and needs to be done just once and does not need to be repeated afterwards\n",
    "ds_inv = xr.open_dataset(path +'era5_geopotential_sfc.nc')\n",
    "\n",
    "# they are named already correctly, and are correctly going already from 0 to 360 (so I removed all transformation steps)\n",
    "print(ds_inv.latitude, ds_inv.longitude)\n",
    "assert np.all(ds_inv.longitude >= 0)\n",
    "nx, ny = ds_inv.sizes['longitude'], ds_inv.sizes['latitude']\n",
    "# get the dataset where coordinates of glaciers are stored\n",
    "frgi_6 = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')\n",
    "odf_6 = pd.read_hdf(frgi_6, index_col=0)\n",
    "#### glacier complexes\n",
    "frgi_7C = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/RGI2000-v7.0-C-global-attributes.csv')\n",
    "odf_7C = pd.read_csv(frgi_7C, index_col=0)\n",
    "#### glaciers \n",
    "frgi_7G = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/RGI2000-v7.0-G-global-attributes.csv')\n",
    "odf_7G = pd.read_csv(frgi_7G, index_col=0)\n",
    "\n",
    "odf_l = []\n",
    "for odf,n in zip([odf_6,odf_7C,odf_7G],['RGI6', 'RGI7C', 'RGI7G']):\n",
    "    # need to repeat this for every option here to have one flattened file that has the necessary points for all three versions\n",
    "    \n",
    "    if (n == 'RGI7C') or (n=='RGI7G'):\n",
    "        # has slightly different column names ... \n",
    "        odf['CenLon'] = odf['cenlon']\n",
    "        odf['CenLat'] = odf['cenlat']\n",
    "        odf['Area'] = odf['area_km2']\n",
    "    # attention the rgi62 stats are in longitude from -179 to 180  \n",
    "    assert odf.CenLon.max() <180\n",
    "    assert odf.CenLon.min() <-175 \n",
    "    cenlon_for_bins = np.where(odf['CenLon'] < 0, odf['CenLon']+360, odf['CenLon'])\n",
    "    # Nearest neighbor lookup --> W5E5 is 0.5° grid, ERA5 is 0.25° grid ... --> here we use 0.125\n",
    "    lon_bins = np.linspace(ds_inv.longitude.data[0] - res/2, ds_inv.longitude.data[-1] + res/2, nx+1)\n",
    "    # !!! attention W5E5 and ERA5 sorted from 90 to -90 !!!!\n",
    "    lat_bins = np.linspace(ds_inv.latitude.data[0] + res/2, ds_inv.latitude.data[-1] - res/2, ny+1)\n",
    "    assert np.diff(lon_bins).min() == res\n",
    "    assert np.diff(lon_bins).max() == res\n",
    "    odf['lon_id'] = np.digitize(cenlon_for_bins, lon_bins)-1\n",
    "    odf['lat_id'] = np.digitize(odf['CenLat'], lat_bins)-1\n",
    "    # if a glacier has a longitude above or equal 359.875, it hould be again in the first bin \n",
    "    # thus if \"lon_id\" ==1440 it is actually in the first bin ... \n",
    "    odf.loc[odf['lon_id'] == nx, 'lon_id'] = 0\n",
    "    \n",
    "    odf['lon_val'] = ds_inv.longitude.data[odf.lon_id]\n",
    "    odf['lat_val'] = ds_inv.latitude.data[odf.lat_id]\n",
    "    # Use unique grid points as index and compute the area per location\n",
    "    odf['unique_id'] = ['{:03d}_{:03d}'.format(lon, lat) for (lon, lat) in zip(odf['lon_id'], odf['lat_id'])]\n",
    "    mdf = odf.drop_duplicates(subset='unique_id').set_index('unique_id')\n",
    "    mdf['Area'] = odf.groupby('unique_id').sum()['Area']\n",
    "    if n == 'RGI7C':\n",
    "        print('{} ––– Total number of glacier complexes: {} and number of ERA5 gridpoints with glacier complexes in them: {}'.format(n, len(odf), len(mdf)))\n",
    "    else:\n",
    "        print('{} ––– Total number of glaciers: {} and number of ERA5 gridpoints with glaciers in them: {}'.format(n, len(odf), len(mdf)))\n",
    "\n",
    "    # at the end, we want to have one \n",
    "    odf['rgi_v'] = n\n",
    "    odf_l.append(odf.copy())\n",
    "odf_all = pd.concat(odf_l)\n",
    "mdf = odf_all.drop_duplicates(subset='unique_id').set_index('unique_id')\n",
    "mdf['Area'] = odf_all.groupby('unique_id').sum()['Area']\n",
    "print('All versions ––– Total number of glacier IDs: {} and number of ERA5 gridpoints with glaciers in them: {}'.format(len(odf_all), len(mdf)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "96fa21bb-6162-4ea5-8091-de185d58be12",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "532\n",
      "CenLon   -141.670274\n",
      "CenLat     69.166921\n",
      "Name: 873_083, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# let's look into those gripoints that are new for RGI7C\n",
    "print(len(mdf.loc[mdf.rgi_v == 'RGI7C']))\n",
    "print(mdf.loc[mdf.rgi_v == 'RGI7C'].iloc[0][['CenLon','CenLat']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "42363681-5200-4f73-96cc-0f179f83aa25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "255\n",
      "CenLon   -66.855668\n",
      "CenLat   -67.535551\n",
      "Name: 1173_630, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# let's look into those gripoints that are new for RGI7G\n",
    "print(len(mdf.loc[mdf.rgi_v == 'RGI7G']))\n",
    "print(mdf.loc[mdf.rgi_v == 'RGI7G'].iloc[-1][['CenLon','CenLat']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "3813435a-d58f-4582-819c-676e1dc017e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check the distance to the gridpoints-> it should never be larger than the resolution of the chosen dataset (for ERA5 : 0.125)\n",
    "diff_lon = ds_inv.longitude.data[odf_all.lon_id] - odf_all.CenLon\n",
    "# if the distance is 360 -> it is the same as 0,\n",
    "diff_lon = np.where(np.abs(diff_lon - 360) < 170, diff_lon-360, diff_lon)\n",
    "odf_all['ll_dist_to_point'] = ((diff_lon)**2 + (ds_inv.latitude.data[odf_all.lat_id] - odf_all.CenLat)**2)**0.5\n",
    "assert odf_all['ll_dist_to_point'].max() < ((res/2)**2 + (res/2)**2)**0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "bbc660ee-c05c-41b1-b879-d29796fc2e48",
   "metadata": {},
   "outputs": [],
   "source": [
    "# this is the mask that we need to remove all non-glacierized gridpoints\n",
    "mask = np.full((ny, nx), np.NaN)\n",
    "mask[mdf['lat_id'], mdf['lon_id']] = 1 \n",
    "ds_inv['glacier_mask'] = (('latitude', 'longitude'), np.isfinite(mask))\n",
    "\n",
    "# check the distance to the gridpoints-> it should never be larger than the resolution of the chosen dataset (for ERA5 : 0.125)\n",
    "diff_lon = ds_inv.longitude.data[odf_all.lon_id] - odf_all.CenLon\n",
    "# if the distance is 360 -> it is the same as 0,\n",
    "diff_lon = np.where(np.abs(diff_lon - 360) < 170, diff_lon-360, diff_lon)\n",
    "odf_all['ll_dist_to_point'] = ((diff_lon)**2 + (ds_inv.latitude.data[odf_all.lat_id] - odf_all.CenLat)**2)**0.5\n",
    "assert odf_all['ll_dist_to_point'].max() < ((res/2)**2 + (res/2)**2)**0.5\n",
    "\n",
    "# somehow the geopotential has a date, but we want to remove tat\n",
    "ds_inv['z'] = ds_inv['z'].sel(valid_time=\"2020-01-01\") \n",
    "\n",
    "###### create the invariant file from the first Monthly dataset using one of the two variables\n",
    "if resolution == 'Monthly':\n",
    "    var = 'tp'\n",
    "    ds = xr.open_dataset(path_unflattened +f\"ERA5_Monthly_{var}_1940.nc\").isel(time=[0])\n",
    "    # they are named already correctly, and are correctly going already from 0 to 360 \n",
    "    # (so I removed all transformation steps)\n",
    "    assert np.all(ds.longitude >= 0)\n",
    "    assert np.all(ds.latitude <= 90)\n",
    "    ds['z'] = ds_inv['z'].copy() #ds_inv['ASurf']\n",
    "    ds['z'].attrs['name'] = 'ERA5 geopotential surface'\n",
    "    # as we use here only those gridpoints where glaciers are involved, need to put the mask on dsi as well!\n",
    "    # dsi = ds_inv.where(ds_inv['glacier_mask'], drop = True)  # this makes out of in total 6483600 points only XXX points!!!\n",
    "    dsi = ds.where(ds_inv['glacier_mask'], drop = True)\n",
    "    # we do not want any dependency on latitude and longitude\n",
    "    dsif = dsi.stack(points=('latitude', 'longitude')).reset_index(('points'))\n",
    "    #dsif # so far still many points \n",
    "    \n",
    "    # drop the non-glacierized points\n",
    "    dsifs = dsif.where(np.isfinite(dsi[var].stack(points=('latitude',\n",
    "                                                          'longitude')).reset_index(('time',\n",
    "                                                                                     'points'))), drop=True)\n",
    "    # I have to drop the 'time_' dimension, to be equal to the era5_land example, because the invariant file should not have any time dependence !\n",
    "    dsifs = dsifs.drop_vars(var)\n",
    "    dsifs = dsifs.drop_vars(['number','expver']) # valid_time\n",
    "    #dsifs = dsifs.drop_vars('time')\n",
    "    # I don't want to drop the dimension as \"z\" is dependent on the timestep, but I want to remove the time dependency\n",
    "    dsifs = dsifs.squeeze('time')\n",
    "    dsifs = dsifs.drop_vars('time')\n",
    "    #dsifs = dsifs.drop('time_')\n",
    "    dsifs.attrs['version'] = v\n",
    "    dsifs.attrs['history'] = dsifs.history + '; only glacier gridpoints chosen and flattened latitude/longitude --> points'\n",
    "    dsifs.attrs['postprocessing_date'] = str(np.datetime64('today','D'))\n",
    "    dsifs.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'\n",
    "    dsifs.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'\n",
    "    utils.mkdir(path + f'flattened/{v[1:8]}')\n",
    "    dsifs.to_netcdf(f'../flattened/{v[1:8]}/era5_glacier_invariant_flat_{v}.nc')\n",
    "    \n",
    "    #check if gridpoint nearest to hef is right!!!\n",
    "    # happened once that something went wrong here ...\n",
    "    lon, lat = (10.7584, 46.8003)\n",
    "    geo = dsifs.z\n",
    "    c = (geo.longitude - lon)**2 + (geo.latitude - lat)**2\n",
    "    geo_hef = geo.isel(points=c.argmin())\n",
    "    np.testing.assert_allclose(geo_hef/9.8, 2250, rtol=0.1)\n",
    "    \n",
    "    # check another glacier\n",
    "    # RGI60-19.00124\n",
    "    lon,lat = (-70.8931 +360, -72.4474)\n",
    "    geo = dsifs.z\n",
    "    c = (geo.longitude - lon)**2 + (geo.latitude - lat)**2\n",
    "    surf_g = geo.isel(points=c.argmin())\n",
    "    np.testing.assert_allclose(surf_g.longitude, lon, atol=res/2)\n",
    "    np.testing.assert_allclose(surf_g.latitude, lat, atol=res/2)\n",
    "    \n",
    "    ## check another glacier RGI60-19.01497\n",
    "    lon,lat = (-179.917000+360, -67.400600)\n",
    "    geo = dsifs.z\n",
    "    c = (geo.longitude - lon)**2 + (geo.latitude - lat)**2\n",
    "    surf_g = geo.isel(points=c.argmin())\n",
    "    np.testing.assert_allclose(surf_g.longitude, lon, atol=res/2)\n",
    "    np.testing.assert_allclose(surf_g.latitude, lat, atol=res/2)\n",
    "\n",
    "    ##RGI60-11.03228 \t11 \t11-02 \tTaillon \t\n",
    "    lon,lat = (-0.039683+360, 42.695419) \t\n",
    "    if lon>360-res/2:\n",
    "        # in that case we actually want to select longitude zero!\n",
    "        lon = 0\n",
    "    geo = dsifs.z\n",
    "    c = (geo.longitude - lon)**2 + (geo.latitude - lat)**2\n",
    "    surf_g = geo.isel(points=c.argmin())\n",
    "    np.testing.assert_allclose(surf_g.longitude, lon, atol=res/2)\n",
    "    np.testing.assert_allclose(surf_g.latitude, lat, atol=res/2)\n",
    "\n",
    "    # RGI60-11.03229 \t-0.057456 \t\n",
    "    lon,lat = (-0.057456+360, 42.695129) \t\n",
    "    if lon>360-res/2:\n",
    "        # in that case we actually want to select longitude zero!\n",
    "        lon = 0\n",
    "    geo = dsifs.z\n",
    "    c = (geo.longitude - lon)**2 + (geo.latitude - lat)**2\n",
    "    surf_g = geo.isel(points=c.argmin())\n",
    "    np.testing.assert_allclose(surf_g.longitude, lon, atol=res/2)\n",
    "    np.testing.assert_allclose(surf_g.latitude, lat, atol=res/2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c95cb54d-cc4e-47cb-bf24-83d1868be478",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now run it over all years \n",
    "try:\n",
    "    os.mkdir(path + '/_raw_files/tmp_{}'.format(resolution.lower()))\n",
    "except:\n",
    "    pass\n",
    "\n",
    "for y in np.arange(y0,y1+1): \n",
    "    for var in ['t2m','tp']:\n",
    "        if resolution == 'Monthly':\n",
    "            ds = xr.open_dataset(path_unflattened + f\"ERA5_{resolution}_{var}_{y}.nc\")\n",
    "        else:\n",
    "            ds = xr.open_mfdataset(path_unflattened + f\"ERA5_{resolution}_{var}_{y}*.nc\")\n",
    "        ds = ds.load()\n",
    "        # they are named already correctly, and are correctly going already from 0 to 360 \n",
    "        # (so I removed all transformation steps)\n",
    "        assert np.all(ds.longitude >= 0)\n",
    "        assert np.all(ds.latitude <= 90)\n",
    "        # this takes too long !!!\n",
    "        # ds_merged_glaciers = xr_prcp.where(ds_inv['glacier_mask'], drop = True)\n",
    "        ds = ds.drop_vars(['number'])\n",
    "        try:\n",
    "            ds = ds.drop_vars(['expver'])\n",
    "        except: \n",
    "            pass\n",
    "        for t in ds.time.values[:]:  \n",
    "            # produce a temporary file for each month (if \"Monthly\" or day if \"Daily\")\n",
    "            sel_l = ds.sel(time=[t])\n",
    "            # don't do the dropping twice!!!!\n",
    "            #sel = sel_l.where(ds_inv['glacier_mask'], drop = True)\n",
    "            sel = sel_l.where(ds_inv['glacier_mask'])\n",
    "            sel = sel.stack(points=('latitude', 'longitude')).reset_index(('time', 'points'))\n",
    "            sel = sel.where(np.isfinite(sel[var]), drop=True)\n",
    "            try: \n",
    "                sel = sel.drop_vars(['valid_time'])\n",
    "            except:\n",
    "                pass\n",
    "            sel.to_netcdf(path+'_raw_files/tmp_{}/tmp_{}_{}.nc'.format(resolution.lower(),var, str(t)[:10]))\n",
    "            sel.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ec3ca152-e07a-438c-b63d-ae0d28a92338",
   "metadata": {},
   "outputs": [],
   "source": [
    "# make yearly files\n",
    "for var in ['t2m', 'tp']:\n",
    "    for y in np.arange(y0, y1+1):\n",
    "        dso_y = xr.open_mfdataset(path+'_raw_files/tmp_{}/tmp_{}_{}-*.nc'.format(resolution.lower(),var, str(y)),\n",
    "                                  concat_dim='time', combine='nested') # .rename_vars({'time_':'time'})\n",
    "        dso_y.to_netcdf(path+'_raw_files/tmp_{}/tmp2_{}_{}.nc'.format(resolution.lower(),var, str(y)))\n",
    "        dso_y.close()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "1052625d-391e-489e-9d8f-d34aa8fd8434",
   "metadata": {},
   "outputs": [],
   "source": [
    "for var in ['tp', 't2m']:\n",
    "    files = sorted(glob.glob(path + f'/_raw_files/tmp_{resolution.lower()}/tmp2_{var}*.nc'))\n",
    "    ds_list = []\n",
    "\n",
    "    for f in files:\n",
    "        ds = xr.open_dataset(f)\n",
    "        # drop variables that could break append\n",
    "        ds = ds.drop_vars(['expver','number','valid_time'], errors='ignore')\n",
    "        ds = ds.reset_coords() # not drop =True, as I still need the latitude, longitudes\n",
    "        ds.load()\n",
    "        if resolution == 'Monthly':\n",
    "            if ds.time.size < 12:\n",
    "                if f == files[-1]: # only do this if it is the last file\n",
    "                    m_last = f'-{int(ds.time.size):02d}'\n",
    "                    # pad last file if monthly --> actually not necessary\n",
    "                    #ds = pad_months_after_last(ds, expected_n=12)\n",
    "                else:\n",
    "                    raise ValueError(f\"File {f} has only {ds.time.size} months, but it is NOT the last file!\")\n",
    "            else:\n",
    "                m_last = ''\n",
    "        if resolution == 'Daily':\n",
    "            if ds.time.size <28:\n",
    "                raise ValueError(f\"File {f} has less than 28 days, something wrong\")\n",
    "                # I will do the complete test later ...\n",
    "            if f == files[-1]:\n",
    "                # this is only if some months are missing for the most recent year\n",
    "                last_timestamp = ds.time.values[-1]\n",
    "                _m = last_timestamp.astype('datetime64[M]').astype(int) % 12 + 1\n",
    "                _d = (last_timestamp - last_timestamp.astype('datetime64[M]')).astype('timedelta64[D]').astype(int) + 1\n",
    "                if _m==12 and _d==31:\n",
    "                    m_last = ''\n",
    "                else:\n",
    "                    m_last = f'-{int(_m):02d}'\n",
    "        ds_list.append(ds)\n",
    "\n",
    "    # concatenate all along time\n",
    "    ds_all = xr.concat(ds_list, dim='time', data_vars='all')\n",
    "\n",
    "    # make lat/lon time-independent ... (I don't know at which step, lat/lon time-dependence got inside ...)\n",
    "    lon_1d = ds_all['longitude'].isel(time=0).drop_vars('time') \n",
    "    lat_1d = ds_all['latitude'].isel(time=0).drop_vars('time')  \n",
    "    # first check that they are really time-independent\n",
    "    assert np.all(ds_all['latitude'].values == ds_all['latitude'].isel(time=0).values)\n",
    "    assert np.all(ds_all['longitude'].values == ds_all['longitude'].isel(time=0).values)\n",
    "    ds_all = ds_all.assign_coords(\n",
    "        longitude=lon_1d,\n",
    "        latitude=lat_1d)\n",
    "\n",
    "    \n",
    "    # update attributes\n",
    "    ds_all.attrs['history'] = ds_all.history + '; only glacier gridpoints chosen and flattened latitude/longitude --> points'\n",
    "    ds_all.attrs['postprocessing_date'] = str(np.datetime64('today','D'))\n",
    "    ds_all.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'\n",
    "    ds_all.attrs['version'] = v\n",
    "    ds_all.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'\n",
    "\n",
    "    # write once\n",
    "    utils.mkdir(path + f'flattened/{v[1:8]}/{resolution.lower()}')\n",
    "    out_path = path + f'flattened/{v[1:8]}/{resolution.lower()}/era5_{var}_global_{resolution.lower()}_{y0}_{y1}{m_last}_flat_glaciers_{v}.nc'\n",
    "    # this got manually moved then to e.g.: '/home/www/oggm/climate/era5/daily/v1.2/flattened/era5_t2m_global_daily_1940_2025_flat_glaciers_v2026.01.14.nc'\n",
    "    ds_all.to_netcdf(out_path, format='NETCDF4', engine='netcdf4', unlimited_dims=['time'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd07b61a-8b79-408a-b9bb-ea8acb1621bd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "9f6dde4d-aaf6-4577-87de-a5d5e307ccfe",
   "metadata": {},
   "outputs": [],
   "source": [
    "adding_single_year = False\n",
    "if adding_single_year:\n",
    "    for var in ['tp', 't2m']:\n",
    "        out_path = path + f'flattened/{v[1:8]}/{resolution.lower()}/era5_{var}_global_{resolution.lower()}_{y0}_{y1}{m_last}_flat_glaciers_{v}.nc'\n",
    "        ds_all = xr.open_dataset(out_path)\n",
    "        \n",
    "        ds = xr.open_dataset(f'/home/www/oggm/climate/era5/daily/v1.2/flattened/era5_{var}_global_daily_1940_2024_flat_glaciers_v2025.11.25.nc')\n",
    "        mask = ~ds.time.isin(ds_all.time)\n",
    "        ds_not_in_all = ds.sel(time=mask)\n",
    "        ds_all.attrs['postprocessing_date'] = ds.attrs['postprocessing_date']\n",
    "        ds_correct = xr.concat([ds_not_in_all, ds_all], dim='time')\n",
    "        ds_correct.to_netcdf(f'/home/www/oggm/climate/era5/daily/v1.2/flattened/era5_{var}_global_daily_1940_{y1}_flat_glaciers_{v}.nc',\n",
    "                             format='NETCDF4', engine='netcdf4', unlimited_dims=['time'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "65d8bac5-ec7d-4399-98ed-b1c77781ae78",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check if all years / months / days are there ... \n",
    "_y0 = 1940 \n",
    "for var in ['tp', 't2m']: \n",
    "    ds = xr.open_dataset(f'/home/www/oggm/climate/era5/daily/v1.2/flattened/era5_{var}_global_daily_{_y0}_{y1}_flat_glaciers_{v}.nc')\n",
    "    \n",
    "    # Expected full daily range\n",
    "    expected_daily = pd.date_range(\n",
    "        start=f\"{_y0}-01-01\",\n",
    "        end=f\"{y1}-12-31\",\n",
    "        freq=\"D\"\n",
    "    )\n",
    "    # actual time series \n",
    "    actual = pd.to_datetime(ds_correct[\"time\"].values)\n",
    "    # missing \n",
    "    missing = expected.difference(actual)\n",
    "    assert len(missing) == 0\n",
    "\n",
    "    ds_m = xr.open_dataset(f'/home/www/oggm/climate/era5/monthly/v1.2/flattened/era5_{var}_global_monthly_{_y0}_{y1}_flat_glaciers_{v}.nc')\n",
    "    assert len(ds_m.time)/12 == y1+1-_y0 #y1+1-y0\n",
    "\n",
    "    assert ds_m.time[0]['time.year'] == _y0\n",
    "    assert ds_m.time[0]['time.month'] == 1\n",
    "    assert ds_m.time[-1]['time.year'] == y1\n",
    "    assert ds_m.time[-1]['time.month'] == 12\n",
    "\n",
    "    assert ds.time[0] == pd.Timestamp(f'{_y0}-01-01')\n",
    "    assert ds.time[-1] == pd.Timestamp(f'{y1}-12-31')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "d6fad2b9-327b-4bb4-8006-d4b7773f4580",
   "metadata": {},
   "outputs": [],
   "source": [
    "for var in ['tp', 't2m']:\n",
    "    files = glob.glob(path+'_raw_files/tmp_{}/tmp_{}*'.format(resolution.lower(), var))\n",
    "    for f in files:\n",
    "        os.remove(f)\n",
    "    files = glob.glob(path+'_raw_files/tmp_{}/tmp2_{}*'.format(resolution.lower(), var))\n",
    "    for f in files:\n",
    "        os.remove(f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9dcc87e4-c61f-44c7-981c-d1a6bbe1ff79",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "2ec8ae41-1272-4030-951c-5aee0a616bc5",
   "metadata": {},
   "source": [
    "# Appendix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dec896b-35ae-4b4e-b5ed-91152e7e6e0a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# necessary if there is not a full year available, but actually when using normal xr.concat not anymore necessary ... \n",
    "def pad_months_after_last(ds, expected_n=12):\n",
    "    \n",
    "    n = ds.sizes['time']\n",
    "    if n >= expected_n:\n",
    "        return ds\n",
    "    missing = expected_n - n\n",
    "    # get last time\n",
    "    last = ds.time.values[-1]\n",
    "\n",
    "    # branch depending on dtype: datetime64 vs cftime\n",
    "    if np.issubdtype(ds.time.dtype, np.datetime64):\n",
    "        last_ts = pd.to_datetime(last)\n",
    "        # compute first missing month: first day of next month\n",
    "        start = (last_ts + pd.offsets.MonthBegin()).normalize()\n",
    "        pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n",
    "        pad_time_values = pad_times.values  # numpy.datetime64[ns]\n",
    "    else:\n",
    "        # use cftime: get a cftime datetime for the next month\n",
    "        # handle different cftime calendars via cftime library defaults\n",
    "        if isinstance(last, cftime.datetime):\n",
    "            year = last.year\n",
    "            month = last.month + 1\n",
    "            year += (month - 1) // 12\n",
    "            month = (month - 1) % 12 + 1\n",
    "            pad_time_values = []\n",
    "            for i in range(missing):\n",
    "                m = month + i\n",
    "                y = year + (m - 1) // 12\n",
    "                mm = (m - 1) % 12 + 1\n",
    "                pad_time_values.append(cftime.datetime(y, mm, 1))\n",
    "        else:\n",
    "            # fallback: convert to pandas and then to cftime datetimes\n",
    "            last_ts = pd.to_datetime(last)\n",
    "            start = (last_ts + pd.offsets.MonthBegin()).normalize()\n",
    "            pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n",
    "            pad_time_values = [cftime.datetime(t.year, t.month, t.day) for t in pad_times]\n",
    "\n",
    "    # create a pad dataset: copy single-time slice and expand with NaNs\n",
    "    pad = ds.isel(time=slice(0, 1)).copy(deep=True) * np.nan\n",
    "    # tile to required number of missing months\n",
    "    pad = pad.isel(time=0).expand_dims(time=pad_time_values)\n",
    "    # assign correct time coordinate\n",
    "    pad = pad.assign_coords(time=('time', pad_time_values))\n",
    "    # concatenate along time\n",
    "    ds_padded = xr.concat([ds, pad], dim='time')\n",
    "    return ds_padded\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:oggm_env_2025]",
   "language": "python",
   "name": "conda-env-oggm_env_2025-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}