In [112]:
import pandas as pd
import os
from glob import glob
import xarray as xr
import numpy as np

In [82]:
# Root directory holding the CMIP5 netCDF files.
fdir = '/home/www/oggm/cmip5-ng'
fl = []
fl_2100 = []
fl_2300 = []
# Walk the whole tree and sort every *.nc file into the 2100 or 2300 list.
for dirpath, _subdirs, _filenames in os.walk(fdir):
    for nc_path in glob(os.path.join(dirpath, '*.nc')):
        # anything with 'redundant' in its path is ignored
        if 'redundant' in nc_path:
            continue
        # the GCMs running until 2300 live in separate 2300/ subfolders
        bucket = fl_2300 if '2300' in nc_path else fl_2100
        bucket.append(nc_path)

In [83]:
# One single-column ('path') frame per time horizon; the remaining
# metadata columns are added by the following cells.
df_2100, df_2300 = (
    pd.DataFrame(paths, columns=['path']) for paths in (fl_2100, fl_2300)
)

In [84]:
def _add_filename_columns(df, n_tokens, strip_extension):
    """Split the file names found in ``df['path']`` into metadata columns.

    The base name of every path is split on ``'_'`` and written back to
    ``df`` (in place) as the columns ``fname``, ``var``, ``gcm``, ``rcp``
    and ``rea``. All names are validated before anything is written, and
    whole columns are assigned at once, so the columns also exist when
    ``df`` is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``path`` column; modified in place.
    n_tokens : int
        Number of ``'_'``-separated tokens every file name must have.
    strip_extension : bool
        If True, the last three characters of the realisation token are
        dropped (it then still carries the ``.nc`` extension).

    Raises
    ------
    AssertionError
        If a file name has a different token count or its second token
        is not ``'mon'``.
    """
    fnames = [os.path.basename(path) for path in df['path']]
    tokens = [fname.split('_') for fname in fnames]
    for fname, parts in zip(fnames, tokens):
        assert len(parts) == n_tokens, f'unexpected file name: {fname}'
        assert parts[1] == 'mon', f'not a monthly file: {fname}'
    df['fname'] = fnames
    df['var'] = [parts[0] for parts in tokens]
    df['gcm'] = [parts[2] for parts in tokens]
    df['rcp'] = [parts[3] for parts in tokens]
    df['rea'] = [parts[4][:-3] if strip_extension else parts[4]
                 for parts in tokens]


# 2100 files have six tokens; the realisation is the fifth one
# (presumably followed by the `g025.nc` grid tag).
_add_filename_columns(df_2100, n_tokens=6, strip_extension=False)

# there is no _g025 in the 2300 GCM names, so the realisation is the last
# token and still carries the file extension
_add_filename_columns(df_2300, n_tokens=5, strip_extension=True)

**Note that the more recently downloaded 2300 GCMs are on a different grid than the 2100 GCMs, which are bilinearly interpolated to 2.5° (and are thus named `_g025.nc`).**
Therefore, we use the `_g025.nc` files for the 2100 simulations wherever possible. Only one GCM that runs until 2300 is not available among the 2100 files; we add that one to `all_gcm_list_2100.csv`.

In [123]:
# Display the first NorESM1-M file of the 2300 list for a quick visual check.
noresm_2300_path = df_2300.loc[df_2300['gcm'] == 'NorESM1-M', 'path'].iloc[0]
xr.open_dataset(noresm_2300_path)

In [87]:
# here you see the different grids: print the first ten latitudes of one
# NorESM1-M file from each list. Separate print statements avoid echoing a
# `(None, None)` tuple as cell output, and the `with` block closes each
# dataset once its values are printed.
for label, frame in (('2300 files: ', df_2300), ('_g025.nc files: ', df_2100)):
    sample_path = frame.loc[frame.gcm == 'NorESM1-M', 'path'].iloc[0]
    with xr.open_dataset(sample_path) as ds:
        print(label)
        print(ds.lat[:10].values)

2300 files: 
[-90.         -88.10526316 -86.21052632 -84.31578947 -82.42105263
 -80.52631579 -78.63157895 -76.73684211 -74.84210526 -72.94736842]
_g025.nc files: 
[-88.75 -86.25 -83.75 -81.25 -78.75 -76.25 -73.75 -71.25 -68.75 -66.25]


(None, None)

In [91]:
# Label the grid type of each list.
df_2100['interpolation'] = '_g025'  # bilinear latitude-longitude grid
df_2300['interpolation'] = 'regular'  # "Regular latitude-longitude grid"

# Open every file once and record its first/last year and the longitude
# spacing; each file must start in January and end in December.
for frame in (df_2100, df_2300):
    for idx, row in frame.iterrows():
        with xr.open_dataset(row['path'], use_cftime=True) as ds:
            years = ds['time.year']
            months = ds['time.month']
            frame.loc[idx, 'y0'] = str(years[0].data)
            assert str(months[0].data) == '1'
            frame.loc[idx, 'y1'] = str(years[-1].data)
            assert str(months[-1].data) == '12'
            lon_step = float(ds.lon[1] - ds.lon[0])
            frame.loc[idx, 'lon_resolution'] = '{:.2f}'.format(lon_step)


In [92]:
# Print every GCM/RCP combination that exists until 2300 but not until 2100.
# The 2100 keys are built once, as a set, instead of being recomputed and
# scanned on every iteration.
gcm_rcp_2100 = set(df_2100['gcm'] + df_2100['rcp'])
for gcm_rcp in (df_2300['gcm'] + df_2300['rcp']).unique():
    if gcm_rcp not in gcm_rcp_2100:
        print(gcm_rcp)
# CESM1-CAM5 is missing in the GCMs until 2100, so its 2300 files are
# appended to the 2100 list
df_2100x = pd.concat([df_2100, df_2300.loc[df_2300.gcm == 'CESM1-CAM5']])

CESM1-CAM5rcp26
CESM1-CAM5rcp45
CESM1-CAM5rcp60


In [93]:
# Realisations present in each list.
(df_2100x['rea'].unique(), df_2300['rea'].unique())

(array(['r1i1p1'], dtype=object), array(['r1i1p1'], dtype=object))

In [94]:
# Scenarios present in each list.
(df_2100x['rcp'].unique(), df_2300['rcp'].unique())

(array(['rcp45', 'historicalGHG', 'rcp60', 'historicalNat', 'rcp26',
        'rcp85'], dtype=object),
 array(['rcp26', 'rcp85', 'rcp45', 'rcp60'], dtype=object))

In [95]:
# Variables present in each list.
(df_2100x.loc[:, 'var'].unique(), df_2300.loc[:, 'var'].unique())

(array(['pr', 'tas'], dtype=object), array(['pr', 'tas'], dtype=object))

In [96]:
# GCMs in the extended 2100 list.
df_2100x['gcm'].unique()

array(['NorESM1-M', 'CanESM2', 'CCSM4', 'IPSL-CM5A-LR', 'CNRM-CM5',
       'CSIRO-Mk3-6-0', 'GISS-E2-R', 'MPI-ESM-LR', 'GFDL-CM3',
       'GFDL-ESM2G', 'CESM1-CAM5'], dtype=object)

In [97]:
# GCMs in the 2300 list.
df_2300['gcm'].unique()

array(['CanESM2', 'MPI-ESM-LR', 'CESM1-CAM5', 'CSIRO-Mk3-6-0',
       'NorESM1-M', 'CCSM4'], dtype=object)

In [114]:
# Every file of the 2300 list must actually end in 2300.
assert (df_2300['y1'] == '2300').all()
# Drop the entries ending in 2005 from the extended 2100 list
# (presumably the historical runs -- TODO confirm).
df_2100x = df_2100x[df_2100x['y1'] != '2005']
# The combined list is built from the plain 2100 list plus the 2300 list.
df = pd.concat([df_2100, df_2300])
for frame, csv_name in (
    (df_2100x, 'all_gcm_list_2100.csv'),
    (df_2300, 'all_gcm_list_2300.csv'),
    (df, 'all_gcm_list.csv'),
):
    frame.to_csv(os.path.join(fdir, csv_name))

In [115]:
# Sort the combined list by file name (then variable) and export the
# selected columns as an HTML table.
html_columns = ["fname", "gcm", "rcp", "rea", "var", "interpolation",
                'lon_resolution', "y0", "y1"]
df = df.sort_values(['fname', 'var'])
html_path = os.path.join(fdir, 'all_gcm_table.html')
with open(html_path, 'w') as html_file:
    df.to_html(buf=html_file, columns=html_columns)

In [120]:
# Build one GCM x RCP availability table per time horizon, write it as HTML
# and print it. An 'X' marks a combination for which exactly one `pr` and
# exactly one `tas` file exist.
for _df, endyr in zip([df_2100x, df_2300], ['2100', '2300']):
    # Collect {rcp: {gcm: 'X'}} first and build the frame in one go, rather
    # than enlarging an empty DataFrame cell by cell with `.loc`.
    availability = {}
    for (gcm, rcp), files in _df.groupby(['gcm', 'rcp'], sort=False):
        # exact match on the variable name (not a substring search)
        assert (files['var'] == 'pr').sum() == 1
        assert (files['var'] == 'tas').sum() == 1
        availability.setdefault(rcp, {})[gcm] = 'X'
    odf = pd.DataFrame(availability)
    # columns (RCPs) and rows (GCMs) in alphabetical order, blanks for gaps
    odf = odf[sorted(odf.columns)].fillna('').sort_index()
    with open(os.path.join(fdir, f'gcm_table_{endyr}.html'), 'w') as fo:
        odf.to_html(fo)
    print(endyr)
    print(odf)
    print('\n')

2100
              rcp26 rcp45 rcp60 rcp85
CCSM4             X     X     X     X
CESM1-CAM5        X     X     X      
CNRM-CM5          X     X           X
CSIRO-Mk3-6-0     X     X     X     X
CanESM2           X     X           X
GFDL-CM3          X     X     X     X
GFDL-ESM2G        X     X     X     X
GISS-E2-R         X     X     X     X
IPSL-CM5A-LR      X     X     X     X
MPI-ESM-LR        X     X           X
NorESM1-M         X     X     X     X


2300
              rcp26 rcp45 rcp60 rcp85
CCSM4                         X     X
CESM1-CAM5        X     X     X      
CSIRO-Mk3-6-0           X           X
CanESM2           X                  
MPI-ESM-LR        X     X           X
NorESM1-M               X            


