# updated script version for flattening W5E5, v2025.11.25
# before it was 2023.3 (for 11900 glaciers, not the nearest climate gridpoint was chosen in v2022.2,
# but this should now work in v2023.2)
# - data from: https://data.isimip.org/search/query/w5e5/climate_forcing/w5e5v2.0/climate_variable/pr/climate_variable/orog/climate_variable/tas/
# - download in terminal: `wget -i download_W5E5_script.txt`
###
# for the monthly files, I had to do the following to remove the time variable from lon/lat:
# with xr.open_dataset(w5e5_files[n]) as _d:
#     _d['latitude'] = _d.latitude.isel(time=0, drop=True)
#     _d['longitude'] = _d.longitude.isel(time=0, drop=True)
#     _d.to_netcdf(isimip3a_files[n][:-3] + 'x.nc')
import xarray as xr
import numpy as np
import pandas as pd
from oggm import utils
import sys
import matplotlib.pyplot as plt
import os

version = '2025.11.25'
v = '_v' + version
res = 0.5

#### this part here is the same for all W5E5 / ISIMIP3a / ISIMIP3b files
ds_inv = xr.open_dataset('orog_W5E5v2.0.nc')
ds_inv = ds_inv.rename({'lat': 'latitude', 'lon': 'longitude'})
ds_inv.coords['longitude'] = np.where(ds_inv.longitude.values < 0,
                                      ds_inv.longitude.values + 360,
                                      ds_inv.longitude.values)
ds_inv = ds_inv.sortby(ds_inv.longitude)
ds_inv.longitude.attrs['units'] = 'degrees_onlypositive'
nx, ny = ds_inv.sizes['longitude'], ds_inv.sizes['latitude']

# get the datasets where the coordinates of the glaciers are stored
frgi_6 = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')
odf_6 = pd.read_hdf(frgi_6, index_col=0)
#### glacier complexes
frgi_7C = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/RGI2000-v7.0-C-global-attributes.csv')
odf_7C = pd.read_csv(frgi_7C, index_col=0)
#### glaciers
frgi_7G = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/RGI2000-v7.0-G-global-attributes.csv')
odf_7G = pd.read_csv(frgi_7G, index_col=0)

odf_l = []
for odf, n in zip([odf_6, odf_7C, odf_7G], ['RGI6', 'RGI7C', 'RGI7G']):
    # repeat this for every RGI option so that one flattened file contains the
    # necessary points for all three versions
    if (n == 'RGI7C') or (n == 'RGI7G'):
        # RGI7 has slightly different column names ...
        odf['CenLon'] = odf['cenlon']
        odf['CenLat'] = odf['cenlat']
        odf['Area'] = odf['area_km2']
    # attention: the rgi62 stats store longitude from -179 to 180
    assert odf.CenLon.max() < 180
    assert odf.CenLon.min() < -175
    cenlon_for_bins = np.where(odf['CenLon'] < 0, odf['CenLon'] + 360, odf['CenLon'])

    # Nearest neighbor lookup --> W5E5 is a 0.5° grid (ERA5 would be 0.25°),
    # so the bin edges are offset by res/2 from the gridpoint centers
    lon_bins = np.linspace(ds_inv.longitude.data[0] - res/2,
                           ds_inv.longitude.data[-1] + res/2, nx + 1)
    # !!! attention: W5E5 and ERA5 latitudes are sorted from 90 to -90 !!!
    lat_bins = np.linspace(ds_inv.latitude.data[0] + res/2,
                           ds_inv.latitude.data[-1] - res/2, ny + 1)
    assert np.diff(lon_bins).min() == res
    assert np.diff(lon_bins).max() == res
    odf['lon_id'] = np.digitize(cenlon_for_bins, lon_bins) - 1
    odf['lat_id'] = np.digitize(odf['CenLat'], lat_bins) - 1
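    # (for reference: for a single glacier, the same gridpoint indices could also be found with
    #  np.argmin(np.abs(ds_inv.longitude.data - cenlon)) and np.argmin(np.abs(ds_inv.latitude.data - cenlat));
    #  digitizing on the bin edges above does this for all glaciers at once)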
    # if a glacier longitude falls at or beyond the last bin edge (360°), np.digitize returns nx;
    # such a point actually belongs to the first grid cell again (wrap-around), so map lon_id == nx back to 0
    odf.loc[odf['lon_id'] == nx, 'lon_id'] = 0
    odf['lon_val'] = ds_inv.longitude.data[odf.lon_id]
    odf['lat_val'] = ds_inv.latitude.data[odf.lat_id]

    # Use unique grid points as index and compute the glacier area per location
    odf['unique_id'] = ['{:03d}_{:03d}'.format(lon, lat)
                        for (lon, lat) in zip(odf['lon_id'], odf['lat_id'])]
    mdf = odf.drop_duplicates(subset='unique_id').set_index('unique_id')
    mdf['Area'] = odf.groupby('unique_id').sum()['Area']
    if n == 'RGI7C':
        print('{} ––– Total number of glacier complexes: {} and number of W5E5/ISIMIP3ab '
              'gridpoints with glacier complexes in them: {}'.format(n, len(odf), len(mdf)))
    else:
        print('{} ––– Total number of glaciers: {} and number of W5E5/ISIMIP3ab '
              'gridpoints with glaciers in them: {}'.format(n, len(odf), len(mdf)))
    # at the end, we want to have one combined dataframe over all RGI versions
    odf['rgi_v'] = n
    odf_l.append(odf.copy())

odf_all = pd.concat(odf_l)
mdf = odf_all.drop_duplicates(subset='unique_id').set_index('unique_id')
mdf['Area'] = odf_all.groupby('unique_id').sum()['Area']
print('All versions ––– Total number of glacier IDs: {} and number of W5E5/ISIMIP3ab '
      'gridpoints with glaciers in them: {}'.format(len(odf_all), len(mdf)))

# this is the mask that we need to remove all non-glacierized gridpoints
mask = np.full((ny, nx), np.nan)
mask[mdf['lat_id'], mdf['lon_id']] = 1
ds_inv['glacier_mask'] = (('latitude', 'longitude'), np.isfinite(mask))

# check the distance to the chosen gridpoints (done here for the last odf of the loop)
# -> it should never be larger than half the grid-cell diagonal
diff_lon = ds_inv.longitude.data[odf.lon_id] - odf.CenLon
# a longitude difference of ~360° is the same as ~0° (wrap-around)
diff_lon = np.where(np.abs(diff_lon - 360) < 170, diff_lon - 360, diff_lon)
odf['ll_dist_to_point'] = (diff_lon**2 + (ds_inv.latitude.data[odf.lat_id] - odf.CenLat)**2)**0.5
assert odf['ll_dist_to_point'].max() < ((res/2)**2 + (res/2)**2)**0.5

### from here on it is different for every script ...
make_daily = True

if make_daily:
    os.makedirs('tmp_pr', exist_ok=True)
    os.makedirs('tmp_tas', exist_ok=True)

    yss = [1979, 1981, 1991, 2001, 2011]
    yee = [1980, 1990, 2000, 2010, 2019]
    for var in ['pr', 'tas']:
        for ys, ye in zip(yss, yee):
            ds = xr.open_dataset('{}_W5E5v2.0_{}0101-{}1231.nc'.format(var, ys, ye))
            ds = ds.rename({'lon': 'longitude', 'lat': 'latitude'})
            ds.coords['longitude'] = np.where(ds.longitude.values < 0,
                                              ds.longitude.values + 360,
                                              ds.longitude.values)
            ds = ds.sortby(ds.longitude)
            ds.longitude.attrs['units'] = 'degrees_onlypositive'
            # this takes too long !!!
            # ds_merged_glaciers = xr_prcp.where(ds_inv['glacier_mask'], drop=True)
            ds['ASurf'] = ds_inv['orog']  # ds_inv['ASurf']

            if ys == 1979 and var == 'pr':
                # as we only use those gridpoints where glaciers are involved,
                # the mask has to be applied to dsi as well!
                # dsi = ds_inv.where(ds_inv['glacier_mask'], drop=True)
                # this makes out of 6483600 points in total only 116280 points!!!
                dsi = ds.isel(time=[0]).where(ds_inv['glacier_mask'], drop=True)
                # we do not want any dependency on latitude and longitude
                dsif = dsi.stack(points=('latitude', 'longitude')).reset_index(('points'))
                # so far still many points -> drop the non-glacierized points
                dsifs = dsif.where(
                    np.isfinite(dsi[var].stack(points=('latitude', 'longitude')).reset_index(('time', 'points'))),
                    drop=True)
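                # note on the flattening: .stack(points=('latitude', 'longitude')) turns the 2D grid
                # into a 1D 'points' dimension; .reset_index() then keeps latitude and longitude as
                # plain per-point coordinates instead of a MultiIndex, so they can be written to netCDF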
                # drop the 'time_' dimension, to be consistent with the era5_land example:
                # the invariant file should not have any time dependence!
                dsifs = dsifs.drop_vars(var)
                dsifs = dsifs.drop_vars('time')
                try:
                    dsifs = dsifs.drop_vars('time_')
                except Exception:
                    pass
                dsifs.attrs['version'] = version
                if version == '2025.11.25':
                    dsifs.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'
                dsifs.to_netcdf(f'../flattened/{version}/w5e5v2.0_glacier_invariant_flat{v}.nc')

                # check that the gridpoint nearest to Hintereisferner (HEF) is right!!!
                # it happened once that something went wrong here ...
                lon, lat = (10.7584, 46.8003)
                # orog = xr.open_dataset('/home/lilianschuster/Downloads/w5e5v2.0_glacier_invariant_flat.nc').ASurf
                orog = dsifs.ASurf
                c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2
                surf_hef = orog.isel(points=c.argmin())
                np.testing.assert_allclose(surf_hef, 2250, rtol=0.1)

                # check the glacier that was wrong in the previous version (RGI60-19.00124)
                lon, lat = (-70.8931 + 360, -72.4474)
                orog = dsifs.ASurf
                c = (orog.longitude - lon)**2 + (orog.latitude - lat)**2
                surf_g = orog.isel(points=c.argmin())
                np.testing.assert_allclose(surf_g.longitude, lon, rtol=0.2)
                np.testing.assert_allclose(surf_g.latitude, lat, rtol=0.2)

            ds = ds.drop_vars('ASurf')
            for t in ds.time.values[:]:  # ds_merged_glaciers.time.values: .sel(time=slice('1901-04','2016-12'))
                # produce a temporary file for each day
                sel_l = ds.sel(time=[t])
                # don't do the dropping twice!!!!
                # sel = sel_l.where(ds_inv['glacier_mask'], drop=True)
                sel = sel_l.where(ds_inv['glacier_mask'])
                sel = sel.stack(points=('latitude', 'longitude')).reset_index(('time', 'points'))
                sel = sel.where(np.isfinite(sel[var]), drop=True)
                sel.to_netcdf('tmp_{}/tmp_{}.nc'.format(var, str(t)[:10]))

###
# rename precipitation to 'pr'
# make yearly files
for var in ['pr', 'tas']:
    for y in np.arange(1979, 2020):
        dso_y = xr.open_mfdataset('tmp_{}/tmp_{}-*.nc'.format(var, str(y)),
                                  concat_dim='time', combine='nested')  # .rename_vars({'time_': 'time'})
        dso_y.to_netcdf('tmp_{}/tmp2_{}.nc'.format(var, str(y)))
        dso_y.close()

# aggregate the yearly files
for var in ['pr', 'tas']:
    dso_all2 = xr.open_mfdataset('tmp_{}/tmp2_*.nc'.format(var), concat_dim='time',
                                 combine='nested', parallel=True)  # .rename_vars({'time_': 'time'})
    dso_all2.attrs['history'] = 'longitudes to 0 -> 360, only glacier gridpoints chosen and flattened latitude/longitude --> points'
    dso_all2.attrs['postprocessing_date'] = str(np.datetime64('today', 'D'))
    dso_all2.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'
    dso_all2.attrs['version'] = version
    if version == '2025.11.25':
        dso_all2.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'
    dso_all2.to_netcdf(f'../flattened/{version}/daily/w5e5v2.0_{var}_global_daily_flat_glaciers_1979_2019{v}.nc')

# make monthly files
for var in ['pr', 'tas']:
    pathi = f'../flattened/{version}/daily/w5e5v2.0_{var}_global_daily_flat_glaciers_1979_2019{v}.nc'
    ds = xr.open_dataset(pathi)
    # resample(time='MS') labels each monthly mean at the first day of the month (month start)
    ds_monthly = ds.resample(time='MS').mean(keep_attrs=True)
    ds_monthly.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'
    ds_monthly.attrs['postprocessing_actions'] = (
        "using xarray: ds_monthly = ds.resample(time='MS').mean(keep_attrs=True)\n"
        "ds_monthly.to_netcdf()\n")
    ds_monthly.attrs['version'] = version
    if version == '2025.11.25':
        ds_monthly.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'
    ds_monthly.to_netcdf(f'../flattened/{version}/monthly/w5e5v2.0_{var}_global_monthly_flat_glaciers_1979_2019{v}.nc')
    if var == 'tas':
        # also compute the monthly standard deviation of the daily values:
        # attention -> doing this on the whole dataset would be slightly wrong, as it creates
        # lon/lat = 0 because the std of lon/lat is zero
        # ds_tas_daily_std = _d.resample(time='MS').std(keep_attrs=True)
        # instead, only do the std over tas and add lat/lon again afterwards ...
        ds_tas_daily_std = ds.tas.resample(time='MS').std(keep_attrs=True)
        ds_tas_daily_std['latitude'] = ds.latitude
        ds_tas_daily_std['longitude'] = ds.longitude
        ds_tas_daily_std = ds_tas_daily_std.to_dataset()
        # rename the variable tas to tas_std and adapt its attributes
        ds_tas_daily_std = ds_tas_daily_std.rename_vars(dict(tas='tas_std'))
        ds_tas_daily_std.tas_std.attrs['standard_name'] = 'air_temperature_daily_std'
        ds_tas_daily_std.tas_std.attrs['long_name'] = 'Near-Surface Air Temperature daily standard deviation'
        ds_tas_daily_std.attrs['postprocessing_date'] = str(np.datetime64('today', 'D'))
        ds_tas_daily_std.attrs['postprocessing_scientist'] = 'lilian.schuster@uibk.ac.at'
        ds_tas_daily_std.attrs['version'] = version
        if version == '2025.11.25':
            ds_tas_daily_std.attrs['note'] = 'includes points from RGI6, RGI7C and RGI7G central longitude and latitude'
        ds_tas_daily_std.to_netcdf(f'../flattened/{version}/monthly/w5e5v2.0_tas_std_global_monthly_flat_glaciers_1979_2019{v}.nc')

# remove the temporary daily files
import glob
for var in ['pr', 'tas']:
    files = glob.glob('tmp_{}/tmp*'.format(var))
    for f in files:
        os.remove(f)
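# optional sanity check on the final monthly output (a minimal usage sketch; it assumes the
# monthly tas file written above exists -> set check_output = True to run it)
check_output = False
if check_output:
    with xr.open_dataset(f'../flattened/{version}/monthly/'
                         f'w5e5v2.0_tas_global_monthly_flat_glaciers_1979_2019{v}.nc') as ds_check:
        # pick the gridpoint nearest to Hintereisferner, as in the invariant-file check above
        lon, lat = (10.7584, 46.8003)
        c = (ds_check.longitude - lon)**2 + (ds_check.latitude - lat)**2
        tas_hef = ds_check.tas.isel(points=int(c.argmin()))
        # 41 years (1979-2019) of monthly means at that point
        assert len(tas_hef.time) == 41 * 12
        print(tas_hef.mean().values)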