{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## New attempt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
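    "In the end, we have, for each RGI version (6.2, 7.0 G and 7.0 C), one HDF file with all GlaThiDa points that fall inside a glacier outline, plus a second HDF file holding one table per RGI glacier ID. The older cells at the bottom of the notebook write one table per RGI region instead."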
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import geopandas as gpd\n",
    "import shapely.geometry as shpg\n",
    "import numpy as np\n",
    "import os\n",
    "import glob\n",
    "import progressbar\n",
    "import time\n",
    "from oggm import utils, cfg\n",
    "import warnings\n",
    "import tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root data directory: the point.csv inputs are read from here and the\n",
    "# HDF outputs are written here.\n",
    "gtd_dir = './glathida-main/data'\n",
    "# gtd_dir = './GlaThiDa_2016'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read the GTD files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./glathida-main/data/lippl-2019-gourdon/point.csv\n",
      "./glathida-main/data/ncdc-2023/point.csv\n",
      "./glathida-main/data/ben-pelto-columbia-basin/point.csv\n",
      "./glathida-main/data/oneel-2011-columbia-bering/point.csv\n",
      "./glathida-main/data/braun-2018-antarctic-peninsula/point.csv\n",
      "./glathida-main/data/luyendyk-2003-marie-byrd-land/point.csv\n",
      "./glathida-main/data/corr-2020-antarctic-peninsula/point.csv\n",
      "./glathida-main/data/oberreuter-2021-artesonraju/point.csv\n",
      "./glathida-main/data/benham-2020-canadian-arctic/point.csv\n",
      "./glathida-main/data/franke-2020-dronning-maud-land/point.csv\n",
      "./glathida-main/data/goel-2019-ice-rises/point.csv\n",
      "./glathida-main/data/lambrecht-2019-fedchenko/point.csv\n",
      "./glathida-main/data/24k-glacier-2019/point.csv\n",
      "./glathida-main/data/swiss-glacier-thickness-r2020/point.csv\n",
      "./glathida-main/data/heinrichs-1995-black-rapids/point.csv\n",
      "./glathida-main/data/conway-2016-black-rapids/point.csv\n",
      "./glathida-main/data/gacitua-2020-schiaparelli/point.csv\n"
     ]
    }
   ],
   "source": [
    "# Shared read_csv options: dates and flags are read as plain strings\n",
    "# instead of letting pandas infer a dtype for them.\n",
    "csv_kwargs = dict(dtype={'date': 'str', 'elevation_date': 'str', 'flag': 'str'}, low_memory=False)\n",
    "\n",
    "# The top-level point.csv comes first ...\n",
    "dfs = [pd.read_csv(os.path.join(gtd_dir, 'point.csv'), **csv_kwargs)]\n",
    "\n",
    "# ... followed by the point.csv of every sub-directory. glob returns paths\n",
    "# in arbitrary order, so they are sorted to get the same row order on\n",
    "# every run and every machine.\n",
    "for path in sorted(glob.glob(os.path.join(gtd_dir, '*', 'point.csv'))):\n",
    "    print(path)\n",
    "    dfs.append(pd.read_csv(path, **csv_kwargs))\n",
    "\n",
    "# ignore_index gives the combined table one continuous 0..n-1 index\n",
    "df = pd.concat(dfs, ignore_index=True)\n",
    "\n",
    "# Attach a point geometry (crs=4326) built from longitude / latitude\n",
    "df = gpd.GeoDataFrame(\n",
    "  df, geometry=gpd.points_from_xy(df['longitude'], df['latitude'], crs=4326)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "survey_id int64\n",
      "glacier_id float64\n",
      "profile_id object\n",
      "point_id object\n",
      "date object\n",
      "max_date object\n",
      "elevation_date object\n",
      "max_elevation_date object\n",
      "latitude float64\n",
      "longitude float64\n",
      "elevation float64\n",
      "thickness int64\n",
      "thickness_uncertainty float64\n",
      "flag object\n",
      "remarks object\n",
      "date_max object\n",
      "geometry geometry\n"
     ]
    }
   ],
   "source": [
    "# One line per column: its name followed by its dtype.\n",
    "for name, dtype in df.dtypes.items():\n",
    "    print(name, dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1432760"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): 3854279 equals the point count shown by the progress bar of\n",
    "# an older cell further down, so this is presumably the number of points\n",
    "# added since that earlier run - confirm.\n",
    "len(df) - 3854279"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5287039"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of rows read from all point.csv files.\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.03687640662382101, 194967)"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share and absolute number of points whose thickness is exactly zero.\n",
    "n_zero = len(df.loc[df.thickness == 0])\n",
    "n_zero / len(df), n_zero"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read RGI, assign and write"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "62 0.8385355205437297\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2029233/185255322.py:26: PerformanceWarning: \n",
      "your performance may suffer as PyTables will pickle object types that it cannot\n",
      "map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['date', 'elevation_date', 'flag', 'rgi_id'], dtype='object')]\n",
      "\n",
      "  to_write.to_hdf(gtd_dir + f'/glathida_2023-11-16_rgi_{rgi_version}.h5', key='data', complevel=5)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "70G 0.8368438364082429\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2029233/185255322.py:26: PerformanceWarning: \n",
      "your performance may suffer as PyTables will pickle object types that it cannot\n",
      "map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['date', 'elevation_date', 'flag', 'rgi_id'], dtype='object')]\n",
      "\n",
      "  to_write.to_hdf(gtd_dir + f'/glathida_2023-11-16_rgi_{rgi_version}.h5', key='data', complevel=5)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "70C 0.8368438364082429\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2029233/185255322.py:26: PerformanceWarning: \n",
      "your performance may suffer as PyTables will pickle object types that it cannot\n",
      "map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['date', 'elevation_date', 'flag', 'rgi_id'], dtype='object')]\n",
      "\n",
      "  to_write.to_hdf(gtd_dir + f'/glathida_2023-11-16_rgi_{rgi_version}.h5', key='data', complevel=5)\n"
     ]
    }
   ],
   "source": [
    "# For each RGI version: attach the id of the enclosing glacier outline to\n",
    "# every GlaThiDa point, then write the matched points to two HDF files\n",
    "# (one flat table under key 'data', and one table per glacier id).\n",
    "\n",
    "# Columns kept in the output files\n",
    "out_cols = ['survey_id', 'date', 'elevation_date',\n",
    "            'latitude', 'longitude', 'elevation', 'thickness',\n",
    "            'thickness_uncertainty', 'flag',\n",
    "            'rgi_id']\n",
    "\n",
    "for rgi_version in ['62', '70G', '70C']:\n",
    "\n",
    "    # Outlines of regions 01 to 19 of this RGI version in a single frame\n",
    "    rdf = pd.concat([gpd.read_file(utils.get_rgi_region_file(f'{reg:02d}', version=rgi_version))\n",
    "                     for reg in range(1, 20)])\n",
    "\n",
    "    if rgi_version == '62':\n",
    "        # Outlines with Connect == 2 are dropped (presumably the ones strongly\n",
    "        # connected to the ice sheet - confirm against the RGI 6 documentation).\n",
    "        # .copy() makes the selection an independent frame, so that adding a\n",
    "        # column below does not write into a slice (SettingWithCopyWarning).\n",
    "        rdf = rdf.loc[rdf['Connect'] != 2].copy()\n",
    "        # RGI 6.2 names its id column RGIId; expose it as rgi_id, the name\n",
    "        # the rest of this cell relies on for all versions.\n",
    "        rdf['rgi_id'] = rdf['RGIId']\n",
    "\n",
    "    # Left join: points that lie within no outline are kept with a null rgi_id\n",
    "    joined = gpd.sjoin(df, rdf, how='left', predicate='within')\n",
    "\n",
    "    has_id = joined.rgi_id.notnull()\n",
    "    no_join = joined.loc[~has_id]  # not used below; kept for inspection\n",
    "    ok_join = joined.loc[has_id]\n",
    "\n",
    "    # Fraction of the points that could be assigned to a glacier\n",
    "    print(rgi_version, len(ok_join) / len(df))\n",
    "\n",
    "    to_write = ok_join[out_cols]\n",
    "\n",
    "    with warnings.catch_warnings():\n",
    "        # The object-dtype pickling warning is raised by pandas, not by\n",
    "        # PyTables: filtering tables.PerformanceWarning alone did not silence\n",
    "        # it (see the stderr previously stored with this cell).\n",
    "        warnings.simplefilter('ignore', pd.errors.PerformanceWarning)\n",
    "        warnings.simplefilter('ignore', tables.PerformanceWarning)\n",
    "        to_write.to_hdf(gtd_dir + f'/glathida_2023-11-16_rgi_{rgi_version}.h5', key='data', complevel=5)\n",
    "\n",
    "    file = gtd_dir + f'/glathida_2023-11-16_rgi_{rgi_version}_per_id.h5'\n",
    "    # to_hdf(append=True) adds to an existing file, so start from scratch\n",
    "    if os.path.exists(file):\n",
    "        os.remove(file)\n",
    "\n",
    "    # One table per glacier id. groupby walks the frame once instead of\n",
    "    # re-scanning every row for each id; sort=False keeps the ids in order of\n",
    "    # first appearance, which is the order unique() produced before.\n",
    "    for rid, tt in to_write.groupby('rgi_id', sort=False):\n",
    "        tt = tt.reset_index(drop=True)\n",
    "        with warnings.catch_warnings():\n",
    "            # The ids are not valid Python identifiers, which PyTables reports\n",
    "            # with a NaturalNameWarning for every key.\n",
    "            warnings.simplefilter('ignore', tables.NaturalNameWarning)\n",
    "            tt.to_hdf(file, key=rid, append=True, complevel=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the glacier ids held by the 70C per-id store.\n",
    "# The path is built from gtd_dir so that it follows the configured data\n",
    "# directory, and the store is opened read-only because nothing is written.\n",
    "with pd.HDFStore(os.path.join(gtd_dir, 'glathida_2023-11-16_rgi_70C_per_id.h5'), mode='r') as store:\n",
    "    # every key comes back with one leading character that is not part of\n",
    "    # the id; it is stripped here\n",
    "    rgi_ids = np.array([s[1:] for s in store.keys()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1278"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of keys (one per glacier id) found in the per-id store.\n",
    "len(rgi_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the flat 70C table back; the path is built from gtd_dir so that it\n",
    "# follows the configured data directory instead of a second hard-coded copy.\n",
    "dfa = pd.read_hdf(os.path.join(gtd_dir, 'glathida_2023-11-16_rgi_70C.h5'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the points whose glacier id contains the '-03-' token.\n",
    "in_reg03 = dfa.rgi_id.str.contains('-03-', regex=False)\n",
    "dfa = dfa.loc[in_reg03]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The flat table was written without a geometry column, so the points are\n",
    "# rebuilt from longitude / latitude (crs=4326).\n",
    "points_reg03 = gpd.points_from_xy(dfa['longitude'], dfa['latitude'], crs=4326)\n",
    "dfa = gpd.GeoDataFrame(dfa, geometry=points_reg03)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2029233/2911545548.py:1: UserWarning: Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.\n",
      "  dfa.to_file(filename='gtd_reg03.shp.zip', driver='ESRI Shapefile')\n"
     ]
    }
   ],
   "source": [
    "# Export the selected points with the ESRI Shapefile driver, relative to the\n",
    "# current working directory. As the warning stored with this cell says, column\n",
    "# names longer than 10 characters are truncated in the output.\n",
    "# NOTE(review): the '.shp.zip' suffix presumably yields a zipped shapefile - confirm.\n",
    "dfa.to_file(filename='gtd_reg03.shp.zip', driver='ESRI Shapefile')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): scratch cell. `t` is only assigned in cells further down, so\n",
    "# this raises NameError when the notebook is run top to bottom on a fresh kernel.\n",
    "t - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): scratch value; the name `t` is reused further down for a\n",
    "# table read from HDF.\n",
    "t = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "survey_id int64\n",
      "date object\n",
      "elevation_date object\n",
      "latitude float64\n",
      "longitude float64\n",
      "elevation float64\n",
      "thickness int64\n",
      "thickness_uncertainty float64\n",
      "flag object\n",
      "rgi_id object\n"
     ]
    }
   ],
   "source": [
    "# Column names and dtypes of the frame that is written to HDF.\n",
    "for name, dtype in to_write.dtypes.items():\n",
    "    print(name, dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discard the current index and renumber the rows from 0.\n",
    "to_write = to_write.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2029233/271614285.py:1: PerformanceWarning: \n",
      "your performance may suffer as PyTables will pickle object types that it cannot\n",
      "map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['date', 'elevation_date', 'flag', 'rgi_id'], dtype='object')]\n",
      "\n",
      "  to_write.to_hdf(gtd_dir + '/glathida_2023-11-16_rgi_70G.h5', key='data')\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_rids():\n",
    "    \"\"\"Return the ids stored in the v3.1.0 per-id HDF file.\n",
    "\n",
    "    Each key of the store is turned into an id by dropping its first\n",
    "    character; the ids are returned as a numpy array.\n",
    "    \"\"\"\n",
    "    with pd.HDFStore('glathida-v3.1.0/data/TTT_RGI_v70C_per_id.h5') as store:\n",
    "        keys = list(store.keys())\n",
    "    # the store can be closed before the keys are post-processed\n",
    "    return np.array([key[1:] for key in keys])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "946 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "# Time listing all ids of the per-id store.\n",
    "%timeit get_rids()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_data(rid):\n",
    "    \"\"\"Read the table stored under key `rid` in the v3.1.0 per-id HDF file.\n",
    "\n",
    "    Returns the table, or None when reading raises KeyError (no such key).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return pd.read_hdf('glathida-v3.1.0/data/TTT_RGI_v70C_per_id.h5', key=rid)\n",
    "    except KeyError:\n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16.5 ms ± 218 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Time reading a single glacier's table by key from the per-id store.\n",
    "%timeit read_data('RGI2000-v7.0-C-02-10810')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "503 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Same timing for a different id.\n",
    "# NOTE(review): the much shorter time suggests this key is absent and\n",
    "# read_data returns None - confirm.\n",
    "%timeit read_data('RGI2000-v7.0-C-02-10812')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the flat v3.1.0 table into memory for the comparisons below.\n",
    "t = pd.read_hdf('glathida-v3.1.0/data/TTT_RGI_v70C.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.96 s ± 99.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "# Time reading the whole flat table from disk.\n",
    "%timeit  pd.read_hdf('glathida-v3.1.0/data/TTT_RGI_v70C.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "191 ms ± 8.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Time selecting one glacier from the in-memory table with a boolean mask.\n",
    "%timeit t.loc[t.rgi_id == 'RGI2000-v7.0-C-02-10810']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "192 ms ± 5.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Same boolean-mask selection for a different id.\n",
    "%timeit t.loc[t.rgi_id == 'RGI2000-v7.0-C-02-10210']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GlaThiDa_ID</th>\n",
       "      <th>POLITICAL_UNIT</th>\n",
       "      <th>GLACIER_NAME</th>\n",
       "      <th>SURVEY_DATE</th>\n",
       "      <th>PROFILE_ID</th>\n",
       "      <th>POINT_ID</th>\n",
       "      <th>POINT_LAT</th>\n",
       "      <th>POINT_LON</th>\n",
       "      <th>ELEVATION</th>\n",
       "      <th>THICKNESS</th>\n",
       "      <th>THICKNESS_UNCERTAINTY</th>\n",
       "      <th>DATA_FLAG</th>\n",
       "      <th>REMARKS</th>\n",
       "      <th>rgi_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>33</td>\n",
       "      <td>US</td>\n",
       "      <td>EASTON</td>\n",
       "      <td>19929999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>48.767380</td>\n",
       "      <td>-121.819644</td>\n",
       "      <td>2962.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>33</td>\n",
       "      <td>US</td>\n",
       "      <td>EASTON</td>\n",
       "      <td>19929999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>48.764904</td>\n",
       "      <td>-121.821909</td>\n",
       "      <td>2813.0</td>\n",
       "      <td>29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>US</td>\n",
       "      <td>EASTON</td>\n",
       "      <td>19929999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3</td>\n",
       "      <td>48.761662</td>\n",
       "      <td>-121.825264</td>\n",
       "      <td>2598.0</td>\n",
       "      <td>41</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>33</td>\n",
       "      <td>US</td>\n",
       "      <td>EASTON</td>\n",
       "      <td>19929999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>48.757063</td>\n",
       "      <td>-121.829107</td>\n",
       "      <td>2383.0</td>\n",
       "      <td>71</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33</td>\n",
       "      <td>US</td>\n",
       "      <td>EASTON</td>\n",
       "      <td>19929999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5</td>\n",
       "      <td>48.753715</td>\n",
       "      <td>-121.832006</td>\n",
       "      <td>2284.0</td>\n",
       "      <td>82</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19089</th>\n",
       "      <td>502</td>\n",
       "      <td>US</td>\n",
       "      <td>SHERMAN CRATER</td>\n",
       "      <td>20109999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76</td>\n",
       "      <td>48.768840</td>\n",
       "      <td>-121.816270</td>\n",
       "      <td>2931.0</td>\n",
       "      <td>59</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19090</th>\n",
       "      <td>502</td>\n",
       "      <td>US</td>\n",
       "      <td>SHERMAN CRATER</td>\n",
       "      <td>20109999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>77</td>\n",
       "      <td>48.768892</td>\n",
       "      <td>-121.816151</td>\n",
       "      <td>2928.0</td>\n",
       "      <td>54</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19091</th>\n",
       "      <td>502</td>\n",
       "      <td>US</td>\n",
       "      <td>SHERMAN CRATER</td>\n",
       "      <td>20109999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>78</td>\n",
       "      <td>48.768944</td>\n",
       "      <td>-121.816032</td>\n",
       "      <td>2926.0</td>\n",
       "      <td>51</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19092</th>\n",
       "      <td>502</td>\n",
       "      <td>US</td>\n",
       "      <td>SHERMAN CRATER</td>\n",
       "      <td>20109999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>79</td>\n",
       "      <td>48.768990</td>\n",
       "      <td>-121.815914</td>\n",
       "      <td>2923.0</td>\n",
       "      <td>49</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19093</th>\n",
       "      <td>502</td>\n",
       "      <td>US</td>\n",
       "      <td>SHERMAN CRATER</td>\n",
       "      <td>20109999</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80</td>\n",
       "      <td>48.769043</td>\n",
       "      <td>-121.815795</td>\n",
       "      <td>2921.0</td>\n",
       "      <td>43</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>RGI2000-v7.0-C-02-10810</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2533 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       GlaThiDa_ID POLITICAL_UNIT    GLACIER_NAME SURVEY_DATE PROFILE_ID  \\\n",
       "0               33             US          EASTON    19929999        NaN   \n",
       "1               33             US          EASTON    19929999        NaN   \n",
       "2               33             US          EASTON    19929999        NaN   \n",
       "3               33             US          EASTON    19929999        NaN   \n",
       "4               33             US          EASTON    19929999        NaN   \n",
       "...            ...            ...             ...         ...        ...   \n",
       "19089          502             US  SHERMAN CRATER    20109999        NaN   \n",
       "19090          502             US  SHERMAN CRATER    20109999        NaN   \n",
       "19091          502             US  SHERMAN CRATER    20109999        NaN   \n",
       "19092          502             US  SHERMAN CRATER    20109999        NaN   \n",
       "19093          502             US  SHERMAN CRATER    20109999        NaN   \n",
       "\n",
       "      POINT_ID  POINT_LAT   POINT_LON  ELEVATION  THICKNESS  \\\n",
       "0            1  48.767380 -121.819644     2962.0          0   \n",
       "1            2  48.764904 -121.821909     2813.0         29   \n",
       "2            3  48.761662 -121.825264     2598.0         41   \n",
       "3            4  48.757063 -121.829107     2383.0         71   \n",
       "4            5  48.753715 -121.832006     2284.0         82   \n",
       "...        ...        ...         ...        ...        ...   \n",
       "19089       76  48.768840 -121.816270     2931.0         59   \n",
       "19090       77  48.768892 -121.816151     2928.0         54   \n",
       "19091       78  48.768944 -121.816032     2926.0         51   \n",
       "19092       79  48.768990 -121.815914     2923.0         49   \n",
       "19093       80  48.769043 -121.815795     2921.0         43   \n",
       "\n",
       "       THICKNESS_UNCERTAINTY  DATA_FLAG REMARKS                   rgi_id  \n",
       "0                        NaN        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "1                        NaN        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "2                        NaN        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "3                        NaN        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "4                        NaN        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "...                      ...        ...     ...                      ...  \n",
       "19089                    5.0        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "19090                    5.0        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "19091                    5.0        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "19092                    5.0        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "19093                    5.0        NaN     NaN  RGI2000-v7.0-C-02-10810  \n",
       "\n",
       "[2533 rows x 14 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the rows of the in-memory table that belong to one glacier id.\n",
    "t.loc[t.rgi_id == 'RGI2000-v7.0-C-02-10810']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add the RGI Region attribute "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100% (3854279 of 3854279) |##############| Elapsed Time: 0:01:44 Time:  0:01:44\n"
     ]
    }
   ],
   "source": [
    "# Region code of every point; -1 marks points that no region contains.\n",
    "# NOTE(review): rgi_reg (used below through .contains, .geometry and an\n",
    "# RGI_CODE column) is not defined anywhere in this notebook - it has to be\n",
    "# loaded before this cell can run.\n",
    "reg = np.full(len(df), -1, dtype=int)\n",
    "prev_reg = None\n",
    "for i, p in progressbar.progressbar(enumerate(df.geometry), max_value=len(df)):\n",
    "    # When the outline matched by the previous point also contains this one,\n",
    "    # reuse its code and skip the search over all regions.\n",
    "    if prev_reg is not None and prev_reg.contains(p):\n",
    "        reg[i] = reg[i-1]\n",
    "        continue\n",
    "    try:\n",
    "        sel = rgi_reg.loc[rgi_reg.contains(p)].iloc[0]\n",
    "    except IndexError:\n",
    "        # No region contains this point: keep -1 and forget the cached outline.\n",
    "        # Only IndexError is caught, so that Ctrl-C and genuine errors (such as\n",
    "        # an undefined rgi_reg) are no longer hidden by a bare except.\n",
    "        prev_reg = None\n",
    "        continue\n",
    "    reg[i] = sel.RGI_CODE\n",
    "    prev_reg = sel.geometry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 16, 17, 18, 19]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Store the region codes on the frame and list the distinct codes in\n",
    "# ascending order.\n",
    "df['RGI_REG'] = reg\n",
    "region_codes = df['RGI_REG'].unique()\n",
    "sorted(region_codes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Separate the data in RGI Regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One independent frame per region code, keyed by the code in ascending order.\n",
    "# A plain dict keeps insertion order, and OrderedDict is never imported in\n",
    "# this notebook (NameError on a fresh kernel).\n",
    "dfs = {r: df.loc[df.RGI_REG == r].copy() for r in sorted(df['RGI_REG'].unique())}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare for writing and write to file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare each regional frame for writing: drop the geometry column and\n",
    "# convert the region code and the elevation to strings. The frames are\n",
    "# modified in place, so the entries of dfs themselves change.\n",
    "for d in dfs.values():\n",
    "    d.drop(columns=['geometry'], inplace=True)\n",
    "    for col in ('RGI_REG', 'ELEVATION'):\n",
    "        d[col] = d[col].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GlaThiDa_ID int64\n",
      "POLITICAL_UNIT object\n",
      "GLACIER_NAME object\n",
      "SURVEY_DATE object\n",
      "PROFILE_ID object\n",
      "POINT_ID object\n",
      "POINT_LAT float64\n",
      "POINT_LON float64\n",
      "ELEVATION object\n",
      "THICKNESS int64\n",
      "THICKNESS_UNCERTAINTY float64\n",
      "DATA_FLAG float64\n",
      "REMARKS object\n",
      "RGI_REG object\n"
     ]
    }
   ],
   "source": [
    "# Column names and dtypes of the last regional frame handled above.\n",
    "for name, dtype in d.dtypes.items():\n",
    "    print(name, dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing 01 87157\n",
      "Writing 02 3406\n",
      "Writing 03 868346\n",
      "Writing 04 309453\n",
      "Writing 05 557136\n",
      "Writing 07 966408\n",
      "Writing 08 10801\n",
      "Writing 10 7726\n",
      "Writing 11 478312\n",
      "Writing 12 2278\n",
      "Writing 13 15327\n",
      "Writing 16 1287\n",
      "Writing 17 8463\n",
      "Writing 18 619\n",
      "Writing 19 537560\n"
     ]
    }
   ],
   "source": [
    "# Write one table per region into a single HDF file.\n",
    "outf = os.path.join(gtd_dir, 'TTT_per_reg.h5')\n",
    "# to_hdf(append=True) adds to what is already there, so remove any previous file\n",
    "if os.path.exists(outf):\n",
    "    os.remove(outf)\n",
    "count = 0\n",
    "for k, d in dfs.items():\n",
    "    # two-digit, zero-padded region code as the table key\n",
    "    key = '{:02d}'.format(int(k))\n",
    "    print('Writing', key, len(d))\n",
    "    with warnings.catch_warnings():\n",
    "        # purely numeric keys are not valid Python identifiers, which PyTables\n",
    "        # reports with a NaturalNameWarning\n",
    "        warnings.simplefilter('ignore', tables.NaturalNameWarning)\n",
    "        # key is passed by keyword: positional arguments after the path are\n",
    "        # deprecated since pandas 2.1\n",
    "        d.to_hdf(outf, key=key, append=True, complevel=5)\n",
    "    count += len(d)\n",
    "# every row of df must have been written to exactly one regional table\n",
    "assert count == len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}