import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
import oggm
from oggm.utils import mkdir
import sys

### start via: `sbatch postprocessing_fill_missing_glaciers_rounce.slurm '01' ` ...

today = '07/02/2022'
model_author = 'Rounce'
gcms = ['gfdl-esm4', 'ipsl-cm6a-lr', 'mpi-esm1-2-hr', 'mri-esm2-0', 'ukesm1-0-ll']
scenarios = ['hist', 'ssp126', 'ssp370', 'ssp585']
y0_times = [1851, 1901, 1951, 1995, 2021, 2041, 2061, 2081]
# y0_times = [1995]  # uncomment to test a single period
print(y0_times)

# get the RGI areas / ITMIX volumes of the glaciers
rgi_regs = [f'{reg:02d}' for reg in range(1, 20)]  # '01' ... '19'

# ITMIX volume estimates (loaded here for reference; only the RGI areas are used below)
df_itmix = pd.read_hdf(oggm.utils.get_demo_file('rgi62_itmix_df.h5'))
rgidf_dict = {}
for rgi_reg in rgi_regs:
    path_rgi = oggm.utils.get_rgi_region_file(rgi_reg, version='6')
    rgidf = gpd.read_file(path_rgi)
    # Greenland periphery: include all glaciers with connectivity level 0 and 1,
    # exclude level 2 glaciers (as was the case for GlacierMIP2).
    # Total RGI area: 89,651 km2
    if rgi_reg == '05':
        rgidf = rgidf.loc[(rgidf['Connect'] == 0) | (rgidf['Connect'] == 1)]
    rgidf_dict[rgi_reg] = rgidf.set_index('RGIId')

# RGI region passed on the command line (e.g. '01'), see the sbatch call above
rgi_reg_sel = sys.argv[1]

### For each RGI region, period, scenario and GCM:
###  - get the per-glacier volumes and areas
###  - estimate which glaciers are missing and compute the missing-vs-working RGI area ratio
###  - enlarge the dataset with the missing glaciers (NaN placeholders first, filled later)
###  - fill each missing glacier with the mean statistics of glaciers whose
###    relative area difference is below 1%

folder_path = '/home/www/drounce/per_glacier-filled-norunaway/'
for rgi_reg in [rgi_reg_sel]:
    print(f'rgi_reg: {rgi_reg} started')
    for y0_time in y0_times:
        period = f'{y0_time}-{y0_time+19}'
        if y0_time in [1851, 1901, 1951, 1995]:
            scenarios_sel = ['hist']
        else:
            # only the ssp scenarios here ('hist' applies to the historical periods above)
            scenarios_sel = scenarios[1:]
        for scenario in scenarios_sel:
            for gcm in gcms:
                path = folder_path + f'{rgi_reg}/{model_author}_rgi{rgi_reg}_glaciers_{period}_{gcm}_{scenario}*.nc'
                ds = xr.open_mfdataset(path)
                ds = ds[['volume_m3', 'area_m2']]
                ds = ds.load()

                # IDs of the "working" glaciers, i.e. those with a valid
                # (non-NaN) volume in the first simulation year
                rgi_reg_working = ds.volume_m3.isel(simulation_year=0).dropna(dim='rgi_id').rgi_id.values
                # set difference is much faster than looping over the RGI index
                bad_rgi = list(set(rgidf_dict[rgi_reg].index).difference(rgi_reg_working))
                bad = rgidf_dict[rgi_reg].loc[bad_rgi]           # missing glaciers
                good = rgidf_dict[rgi_reg].loc[rgi_reg_working]  # working glaciers
                bad_ratio = bad.Area.sum() / good.Area.sum()
                assert len(bad) + len(good) == len(rgidf_dict[rgi_reg].index)

                # create a dummy xarray dataset with empty data for the missing
                # glaciers; it will be filled afterwards
                ds_dummy = ds.isel(rgi_id=0).load().copy(deep=True)
                ds_dummy['volume_m3'].data[...] = np.nan
                ds_dummy['area_m2'].data[...] = np.nan
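                # (the single-glacier NaN template loses its scalar rgi_id
                #  coordinate below and is then broadcast along a new rgi_id
                #  dimension, yielding one all-NaN volume/area series per
                #  missing glacier)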
                ds_dummy = ds_dummy.drop_vars('rgi_id')
                ds_dummy = ds_dummy.expand_dims({'rgi_id': bad.index}).transpose()

                # add the dummy xarray dataset to the existing glaciers
                ds_sel_full = xr.concat([ds, ds_dummy], dim='rgi_id')

                # repair the missing glaciers
                ds_rep_sel = ds_sel_full.copy(deep=True)
                ds_rep_sel = ds_rep_sel.squeeze()
                ds_index = ds_rep_sel.rgi_id.data
                # need to loop over every missing glacier
                for rid, g in bad.iterrows():
                    # relative area difference to all working glaciers
                    area_diff = np.abs(good.Area - g.Area) / g.Area
                    # if possible, take all glaciers of that RGI region that
                    # have a relative area difference smaller than 1 %
                    sel = area_diff.loc[area_diff < 0.01]
                    if len(sel) < 3:
                        # if fewer than 3 glaciers have a similar area,
                        # take the 9 most area-similar glaciers instead
                        sel = area_diff.sort_values().iloc[:9]
                    # average over the selected area-similar glaciers and use the
                    # result as fake statistics for the missing glacier (in our
                    # case, area and volume are estimated for each simulation
                    # year this way)
                    new = ds.sel(rgi_id=sel.index).mean(dim='rgi_id').squeeze()
                    rep_id = np.nonzero(ds_index == rid)[0][0]
                    for vn in ds_rep_sel.data_vars:
                        # rgi_id is the trailing dimension here
                        ds_rep_sel[vn].data[..., rep_id] = new[vn].data

                # convert to float64 for exact summing
                ds_rep_sel = ds_rep_sel.astype(np.float64)

                # add some statistics about how much of this RGI region had to be filled
                ds_rep_sel.attrs['ratio_area_missing_filled_vs_working'] = bad_ratio
                ds_rep_sel.attrs['n_glaciers_missing_filled'] = len(bad)

                # create a dataset with the same form as the "raw" uncorrected
                # dataset, but note in the attributes that the data was corrected
                ds_rep_sel.attrs['contributor'] = model_author
                ds_rep_sel.attrs['contributor_email'] = 'drounce@cmu.edu'
                ds_rep_sel.attrs['creation_date'] = str(today)
                ds_rep_sel.attrs['rgi-region'] = f'rgi{rgi_reg}'
                ds_rep_sel.attrs['aggregation-level'] = 'sum'
                ds_rep_sel.attrs['period'] = period
                ds_rep_sel.attrs['gcm'] = gcm
                ds_rep_sel.attrs['ssp'] = scenario
                add = ('\n(missing glaciers were filled by Lilian Schuster with average glacier '
                       'statistics from similar-area glaciers of that RGI region; additional filling '
                       'done before by David Rounce if the glacier exists for one gcm, and runaway '
                       'effect removed)')
                ds_rep_sel.attrs['information'] = ('PyGEM for mass balance and calibration '
                                                   'with OGGM for glacier dynamics') + add
                ds_rep_sel.attrs['stop_criterion'] = ('Simulations were stopped if volume was 0 for '
                                                      '20 years or 100-yr avg mb was within +/- 10 mm w.e.')

                # check that no NaN values are left (all glaciers should have been filled!)
                assert set(ds_rep_sel.dropna(dim='rgi_id', how='any').rgi_id.values) == set(rgidf_dict[rgi_reg].index)

                # we only keep the regionally summed-up filled files for the moment
                ds_rep_sel = ds_rep_sel.sum(dim='rgi_id', keep_attrs=True)

                # save the data
                out_dir = '/home/users/lschuster/www_lschuster/glacierMIP3_analysis/Rounce/regional_filled/'
                out_path = out_dir + f'{rgi_reg}/{model_author}_rgi{rgi_reg}_sum_{period}_{gcm}_{scenario}_filled.nc'
                mkdir(out_dir + rgi_reg)
                ds_rep_sel.to_netcdf(out_path)

                # close the datasets to be sure ...
                ds_rep_sel.close()
                ds_sel_full.close()
                ds.close()
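
# -----------------------------------------------------------------------------
# For reference, a minimal standalone sketch of the area-similarity selection
# used in the fill loop above. The function name `select_similar_glaciers` and
# the toy numbers are illustrative only (they are not part of the original
# processing chain); the thresholds mirror the hard-coded values above.

def select_similar_glaciers(good_areas, target_area, rel_tol=0.01, n_fallback=9):
    """Return the RGI ids of working glaciers used to fill one missing glacier.

    good_areas : pd.Series of RGI areas of the working glaciers, indexed by RGIId
    target_area : RGI area (same unit) of the missing glacier
    """
    # relative area difference of every working glacier to the missing one
    area_diff = np.abs(good_areas - target_area) / target_area
    # primary rule: all glaciers within the relative area tolerance
    sel = area_diff.loc[area_diff < rel_tol]
    if len(sel) < 3:
        # fallback: the n_fallback most area-similar glaciers
        sel = area_diff.sort_values().iloc[:n_fallback]
    return sel.index

# Example usage (toy areas in km2):
# good_areas = pd.Series([1.000, 1.004, 0.998, 5.0],
#                        index=['RGI60-01.00001', 'RGI60-01.00002',
#                               'RGI60-01.00003', 'RGI60-01.00004'])
# select_similar_glaciers(good_areas, 1.0)  # -> the first three ids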