{ "cells": [ { "cell_type": "markdown", "id": "dffd70cd-400c-4634-afb7-2afdf7f23209", "metadata": {}, "source": [ "# Flatten monthly ERA5 files and select only glaciated gridpoints \n", "\n", "--> monthly: currently from 1940-01 to 2025-09, but only applied the flattening until end of 2024\n", "\n", "In `/home/www/lschuster/example_ipynb/flatten_glacier_gridpoint_tests_2025_6_new_isimip3b.ipynb`, we check if the flattening approach works as it should. \n", "\n", "- todo:\n", " - create a workflow that directly checks how much got downloaded and appends the missing months ...? (eventually even starts a script to download the missing files until now) \n", " - check if the files can be easily incorporated into existing OGGM methods/functions ... " ] }, { "cell_type": "code", "execution_count": 1, "id": "83acffa7-6af9-402e-92df-a24ccd179136", "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "import numpy as np\n", "import pandas as pd\n", "from oggm import utils\n", "\n", "import sys\n", "import matplotlib.pyplot as plt\n", "import os\n", "import glob\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "5299a1f3-1437-47bc-bdda-871264d1f807", "metadata": {}, "outputs": [], "source": [ "resolution = 'Monthly' #'Monthly'\n", "y0 = 1940\n", "y1 = 2024 #2024" ] }, { "cell_type": "code", "execution_count": 3, "id": "bbc660ee-c05c-41b1-b879-d29796fc2e48", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Size: 6kB\n", "array([ 90. , 89.75, 89.5 , ..., -89.5 , -89.75, -90. ])\n", "Coordinates:\n", " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", " number int64 8B ...\n", " expver Size: 12kB\n", "array([0.0000e+00, 2.5000e-01, 5.0000e-01, ..., 3.5925e+02, 3.5950e+02,\n", " 3.5975e+02])\n", "Coordinates:\n", " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", " number int64 8B ...\n", " expver = 0)\n", "nx, ny = ds_inv.sizes['longitude'], ds_inv.sizes['latitude']\n", "# get the dataset where coordinates of glaciers are stored\n", "frgi = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')\n", "odf = pd.read_hdf(frgi, index_col=0)\n", "# attention the rgi62 stats are in longitude from -179 to 180 \n", "assert odf.CenLon.max() <180\n", "assert odf.CenLon.min() <-175 \n", "cenlon_for_bins = np.where(odf['CenLon'] < 0, odf['CenLon']+360, odf['CenLon'])\n", "# Nearest neighbor lookup --> W5E5 is 0.5° grid, ERA5 is 0.25° grid ... \n", "lon_bins = np.linspace(ds_inv.longitude.data[0] - 0.125, ds_inv.longitude.data[-1] + 0.125, nx+1)\n", "# !!! attention W5E5 sorted from 90 to -90 !!!!\n", "lat_bins = np.linspace(ds_inv.latitude.data[0] + 0.125, ds_inv.latitude.data[-1] - 0.125, ny+1)\n", "assert np.diff(lon_bins).min() == 0.25\n", "assert np.diff(lon_bins).max() == 0.25\n", "odf['lon_id'] = np.digitize(cenlon_for_bins, lon_bins)-1\n", "odf['lat_id'] = np.digitize(odf['CenLat'], lat_bins)-1\n", "# if a glacier has a longitude above or equal 359.875, it hould be again in the first bin \n", "# thus if \"lon_id\" ==1440 it is actually in the first bin ... 
\n", "odf.loc[odf['lon_id'] == nx, 'lon_id'] = 0\n", "\n", "odf['lon_val'] = ds_inv.longitude.data[odf.lon_id]\n", "odf['lat_val'] = ds_inv.latitude.data[odf.lat_id]\n", "# Use unique grid points as index and compute the area per location\n", "odf['unique_id'] = ['{:03d}_{:03d}'.format(lon, lat) for (lon, lat) in zip(odf['lon_id'], odf['lat_id'])]\n", "mdf = odf.drop_duplicates(subset='unique_id').set_index('unique_id')\n", "mdf['Area'] = odf.groupby('unique_id').sum()['Area']\n", "print('Total number of glaciers: {} and number of ERA5 gridpoints with glaciers in them: {}'.format(len(odf), len(mdf)))\n", "\n", "# this is the mask that we need to remove all non-glacierized gridpoints\n", "mask = np.full((ny, nx), np.NaN)\n", "mask[mdf['lat_id'], mdf['lon_id']] = 1 \n", "ds_inv['glacier_mask'] = (('latitude', 'longitude'), np.isfinite(mask))\n", "\n", "# check the distance to the gridpoints-> it should never be larger than \n", "diff_lon = ds_inv.longitude.data[odf.lon_id] - odf.CenLon\n", "# if the distance is 360 -> it is the same as 0,\n", "diff_lon = np.where(np.abs(diff_lon - 360) < 170, diff_lon-360, diff_lon)\n", "odf['ll_dist_to_point'] = ((diff_lon)**2 + (ds_inv.latitude.data[odf.lat_id] - odf.CenLat)**2)**0.5\n", "assert odf['ll_dist_to_point'].max() < (0.125**2 + 0.125**2)**0.5\n", "# somehow the orography has a date, but wewant to remove tat\n", "ds_inv = ds_inv.sel(valid_time=\"2020-01-01\")\n", "\n", "###### create the invariant file from the first Monthly dataset using one of the two variables\n", "if resolution == 'Monthly':\n", " var = 'tp'\n", " ds = xr.open_dataset(f\"/home/www/oggm/climate/wip/era5_monthly/_raw_files/ERA5_Monthly_{var}_1940.nc\")\n", " # they are named already correctly, and are correctly going already from 0 to 360 \n", " # (so I removed all transformation steps)\n", " assert np.all(ds.longitude >= 0)\n", " assert np.all(ds.latitude <= 90)\n", " ds['ASurf'] = ds_inv['orography'].copy() #ds_inv['ASurf']\n", " \n", " # as we use here only those gridpoints where glaciers are involved, need to put the mask on dsi as well!\n", " # dsi = ds_inv.where(ds_inv['glacier_mask'], drop = True) # this makes out of in total 6483600 points only XXX points!!!\n", " dsi = ds.isel(time=[0]).where(ds_inv['glacier_mask'], drop = True)\n", " # we do not want any dependency on latitude and longitude\n", " dsif = dsi.stack(points=('latitude', 'longitude')).reset_index(('points'))\n", " #dsif # so far still many points \n", " \n", " # drop the non-glacierized points\n", " dsifs = dsif.where(np.isfinite(dsi[var].stack(points=('latitude',\n", " 'longitude')).reset_index(('time',\n", " 'points'))), drop=True)\n", " # I have to drop the 'time_' dimension, to be equal to the era5_land example, because the invariant file should not have any time dependence !\n", " dsifs = dsifs.drop_vars(var)\n", " dsifs = dsifs.drop_vars(['number','valid_time', 'expver'])\n", " #dsifs = dsifs.drop_vars('time')\n", " # I don't want to drop the dimension as ASurf is dependent on the timestep, but I want to remove the time dependency\n", " # dsifs = dsifs.drop_dims('time')\n", " dsifs = dsifs.squeeze('time')\n", " #dsifs = dsifs.drop('time_')\n", " dsifs.attrs['version'] = '2025.11'\n", " dsifs.to_netcdf('../2025.11/era5_glacier_invariant_flat.nc')\n", " \n", " #check if gridpoint nearest to hef is right!!!\n", " # happened once that something went wrong here ...\n", " lon, lat = (10.7584, 46.8003)\n", " #orog = 
xr.open_dataset('/home/lilianschuster/Downloads/w5e5v2.0_glacier_invariant_flat.nc').ASurf\n", " orog = dsifs.ASurf\n", " c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2\n", " surf_hef = orog.isel(points=c.argmin())\n", " np.testing.assert_allclose(surf_hef, 2250, rtol=0.1)\n", " \n", " # check another glacier\n", " # RGI60-19.00124\n", " lon,lat = (-70.8931 +360, -72.4474)\n", " orog = dsifs.ASurf\n", " c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2\n", " surf_g = orog.isel(points=c.argmin())\n", " np.testing.assert_allclose(surf_g.longitude, lon, rtol=0.2)\n", " np.testing.assert_allclose(surf_g.latitude, lat, rtol=0.2)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "c95cb54d-cc4e-47cb-bf24-83d1868be478", "metadata": {}, "outputs": [], "source": [ "# Now run it over all years \n", "try:\n", " os.mkdir('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_tp'.format(resolution.lower()))\n", " os.mkdir('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_t2m'.format(resolution.lower()))\n", "except:\n", " pass\n", "\n", "for y in np.arange(y0,y1+1): \n", " for var in ['t2m','tp']:\n", " if resolution == 'Monthly':\n", " ds = xr.open_dataset(f\"/home/www/oggm/climate/wip/era5_monthly/_raw_files/ERA5_{resolution}_{var}_{y}.nc\")\n", " else:\n", " ds = xr.open_mfdataset(f\"/home/www/oggm/climate/wip/era5_daily_lily/_raw_files/ERA5_{resolution}_{var}_{y}*.nc\")\n", " ds = ds.load()\n", " # they are named already correctly, and are correctly going already from 0 to 360 \n", " # (so I removed all transformation steps)\n", " assert np.all(ds.longitude >= 0)\n", " assert np.all(ds.latitude <= 90)\n", " # this takes too long !!!\n", " # ds_merged_glaciers = xr_prcp.where(ds_inv['glacier_mask'], drop = True)\n", " ds = ds.drop_vars(['number'])\n", " try:\n", " ds = ds.drop_vars(['expver'])\n", " except: \n", " pass\n", " for t in ds.time.values[:]: \n", " # produce a temporary file for each month (if \"Monthly\" or day if \"Daily\")\n", " sel_l = ds.sel(time=[t])\n", " # don't do the dropping twice!!!!\n", " #sel = sel_l.where(ds_inv['glacier_mask'], drop = True)\n", " sel = sel_l.where(ds_inv['glacier_mask'])\n", " sel = sel.stack(points=('latitude', 'longitude')).reset_index(('time', 'points'))\n", " sel = sel.where(np.isfinite(sel[var]), drop=True)\n", " sel = sel.drop_vars(['valid_time'])\n", " sel.to_netcdf('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}_{}.nc'.format(resolution.lower(),var, resolution, str(t)[:10]))\n", " sel.close()\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "ec3ca152-e07a-438c-b63d-ae0d28a92338", "metadata": {}, "outputs": [], "source": [ "# make yearly files\n", "for var in ['t2m', 'tp']:\n", " for y in np.arange(y0, y1+1):\n", " dso_y = xr.open_mfdataset('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}_{}-*.nc'.format(resolution.lower(),var, resolution, str(y)),\n", " concat_dim='time', combine='nested') # .rename_vars({'time_':'time'})\n", " dso_y.to_netcdf('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp2_{}_{}.nc'.format(resolution.lower(),var, resolution, str(y)))\n", " dso_y.close()\n", " " ] }, { "cell_type": "code", "execution_count": 6, "id": "19cc53cb-d690-4b12-a714-4b13abae4476", "metadata": {}, "outputs": [], "source": [ "#xr.open_dataset(f).drop_vars(['expver','number','valid_time'], errors='ignore').reset_coords()" ] }, { "cell_type": "code", "execution_count": 7, "id": "1052625d-391e-489e-9d8f-d34aa8fd8434", "metadata": {}, "outputs": [], "source": [ "for var in ['tp', 't2m']:\n", " 
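\n", " # gather the yearly tmp2_* files for this variable; sorted() matters because files[-1] is treated below as the (possibly incomplete) last year\n", "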
files = sorted(glob.glob(f'/home/www/oggm/climate/wip/era5_{resolution.lower()}/_raw_files/tmp_{var}/tmp2_{resolution}*.nc'))\n", " ds_list = []\n", "\n", " for f in files:\n", " ds = xr.open_dataset(f)\n", " # drop variables that could break the concatenation\n", " ds = ds.drop_vars(['expver','number','valid_time'], errors='ignore')\n", " ds = ds.reset_coords() # not drop=True, as I still need the latitudes and longitudes\n", " ds.load()\n", " if resolution == 'Monthly':\n", " if ds.time.size < 12:\n", " if f == files[-1]: # only do this if it is the last file\n", " m_last = f'-{int(ds.time.size):02d}'\n", " # pad last file if monthly --> actually not necessary\n", " #ds = pad_months_after_last(ds, expected_n=12)\n", " else:\n", " raise ValueError(f\"File {f} has only {ds.time.size} months, but it is NOT the last file!\")\n", " else:\n", " m_last = ''\n", " if resolution == 'Daily':\n", " if ds.time.size <28:\n", " raise ValueError(f\"File {f} has fewer than 28 days, something is wrong\")\n", " if f == files[-1]:\n", " last_timestamp = ds.time.values[-1]\n", " _m = last_timestamp.astype('datetime64[M]').astype(int) % 12 + 1\n", " _d = (last_timestamp - last_timestamp.astype('datetime64[M]')).astype('timedelta64[D]').astype(int) + 1\n", " if _m==12 and _d==31:\n", " m_last = ''\n", " else:\n", " m_last = f'-{int(_m):02d}'\n", " ds_list.append(ds)\n", "\n", " # concatenate all along time\n", " ds_all = xr.concat(ds_list, dim='time', data_vars='all')\n", "\n", " # update attributes\n", " ds_all.attrs['history'] = ds_all.history + '; only glacier gridpoints chosen and flattened latitude/longitude --> points'\n", " ds_all.attrs['postprocessing_date'] = str(np.datetime64('today','D'))\n", " ds_all.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'\n", " ds_all.attrs['version'] = '2025.11'\n", "\n", " # write once\n", " out_path = f'/home/www/oggm/climate/wip/era5_{resolution.lower()}/flattened/2025.11/{resolution.lower()}/era5_{var}_global_{resolution.lower()}_{y0}_{y1}{m_last}_flat_glaciers.nc'\n", " ds_all.to_netcdf(out_path, format='NETCDF4', engine='netcdf4', unlimited_dims=['time'])\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "d6fad2b9-327b-4bb4-8006-d4b7773f4580", "metadata": {}, "outputs": [], "source": [ "for var in ['tp', 't2m']:\n", " files = glob.glob('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp_{}*'.format(resolution.lower(), var, resolution))\n", " for f in files:\n", " os.remove(f)\n", " files = glob.glob('/home/www/oggm/climate/wip/era5_{}/_raw_files/tmp_{}/tmp2_{}*'.format(resolution.lower(), var, resolution))\n", " for f in files:\n", " os.remove(f)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a56a3ad9-4ac6-4202-baaa-fb24baa2d368", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9dcc87e4-c61f-44c7-981c-d1a6bbe1ff79", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2ec8ae41-1272-4030-951c-5aee0a616bc5", "metadata": {}, "source": [ "# Appendix" ] }, { "cell_type": "code", "execution_count": null, "id": "2dec896b-35ae-4b4e-b5ed-91152e7e6e0a", "metadata": {}, "outputs": [], "source": [ "# only necessary if there is not a full year available, but when using plain xr.concat it is actually no longer needed ...\n", "import cftime  # needed for the cftime branch in pad_months_after_last below\n", "
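# usage sketch (added): ds_full = pad_months_after_last(ds_partial, expected_n=12)\n", "# (ds_partial = a hypothetical monthly dataset with fewer than 12 timesteps; the missing months are appended as NaN)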
\n", "def pad_months_after_last(ds, expected_n=12):\n", " \n", " n = ds.sizes['time']\n", " if n >= expected_n:\n", " return ds\n", " missing = expected_n - n\n", " # get last time\n", " last = ds.time.values[-1]\n", "\n", " # branch depending on dtype: datetime64 vs cftime\n", " if np.issubdtype(ds.time.dtype, np.datetime64):\n", " last_ts = pd.to_datetime(last)\n", " # compute first missing month: first day of next month\n", " start = (last_ts + pd.offsets.MonthBegin()).normalize()\n", " pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n", " pad_time_values = pad_times.values # numpy.datetime64[ns]\n", " else:\n", " # use cftime: get a cftime datetime for the next month\n", " # handle different cftime calendars via cftime library defaults\n", " if isinstance(last, cftime.datetime):\n", " year = last.year\n", " month = last.month + 1\n", " year += (month - 1) // 12\n", " month = (month - 1) % 12 + 1\n", " pad_time_values = []\n", " for i in range(missing):\n", " m = month + i\n", " y = year + (m - 1) // 12\n", " mm = (m - 1) % 12 + 1\n", " pad_time_values.append(cftime.datetime(y, mm, 1))\n", " else:\n", " # fallback: convert to pandas and then to cftime datetimes\n", " last_ts = pd.to_datetime(last)\n", " start = (last_ts + pd.offsets.MonthBegin()).normalize()\n", " pad_times = pd.date_range(start=start, periods=missing, freq='MS')\n", " pad_time_values = [cftime.datetime(t.year, t.month, t.day) for t in pad_times]\n", "\n", " # create a pad dataset: copy single-time slice and expand with NaNs\n", " pad = ds.isel(time=slice(0, 1)).copy(deep=True) * np.nan\n", " # tile to required number of missing months\n", " pad = pad.isel(time=0).expand_dims(time=pad_time_values)\n", " # assign correct time coordinate\n", " pad = pad.assign_coords(time=('time', pad_time_values))\n", " # concatenate along time\n", " ds_padded = xr.concat([ds, pad], dim='time')\n", " return ds_padded\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:oggm_env_2025]", "language": "python", "name": "conda-env-oggm_env_2025-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.14" } }, "nbformat": 4, "nbformat_minor": 5 }