From 2ffdf3695379876dcb05510b541dbe4d3a4612e5 Mon Sep 17 00:00:00 2001 From: Wei Ji Date: Fri, 8 May 2020 14:57:41 +1200 Subject: [PATCH] :art: Pair notebook with .py script and lint code with black Pair up the jupyter notebook with a .py script, and lint it with black. Nicer to look at and easier to diff! --- atl06_play.ipynb | 208 +++++++++++------ atl06_play.py | 576 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 719 insertions(+), 65 deletions(-) create mode 100644 atl06_play.py diff --git a/atl06_play.ipynb b/atl06_play.ipynb index d6f7443..d73d68a 100644 --- a/atl06_play.ipynb +++ b/atl06_play.ipynb @@ -117,7 +117,9 @@ "metadata": {}, "outputs": [], "source": [ - "catalog = intake.open_catalog(uri=\"catalog.yaml\") # open the local catalog file containing ICESAT2 stuff" + "catalog = intake.open_catalog(\n", + " uri=\"catalog.yaml\"\n", + ") # open the local catalog file containing ICESAT2 stuff" ] }, { @@ -894,12 +896,16 @@ "try:\n", " netrc.netrc()\n", "except FileNotFoundError as error_msg:\n", - " print(f\"{error_msg}, please follow instructions to create one at \"\n", - " \"https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled \"\n", - " 'basically using `echo \"machine urs.earthdata.nasa.gov login password \" >> ~/.netrc`')\n", + " print(\n", + " f\"{error_msg}, please follow instructions to create one at \"\n", + " \"https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled \"\n", + " 'basically using `echo \"machine urs.earthdata.nasa.gov login password \" >> ~/.netrc`'\n", + " )\n", " raise\n", "\n", - "dataset = catalog.icesat2atl06.to_dask().unify_chunks() # depends on .netrc file in home folder\n", + "dataset = (\n", + " catalog.icesat2atl06.to_dask().unify_chunks()\n", + ") # depends on .netrc file in home folder\n", "dataset" ] }, @@ -909,10 +915,10 @@ "metadata": {}, "outputs": [], "source": [ - "#dataset.hvplot.points(\n", 
+ "# dataset.hvplot.points(\n", "# x=\"longitude\", y=\"latitude\", datashade=True, width=800, height=500, hover=True,\n", "# #geo=True, coastline=True, crs=cartopy.crs.PlateCarree(), #projection=cartopy.crs.Stereographic(central_latitude=-71),\n", - "#)\n", + "# )\n", "catalog.icesat2atl06.hvplot.quickview()" ] }, @@ -950,8 +956,7 @@ "for date in dates:\n", " source = catalog.icesat2atlasdownloader(date=date)\n", " future = client.submit(\n", - " func=source.discover,\n", - " key=f\"download-{date}\",\n", + " func=source.discover, key=f\"download-{date}\",\n", " ) # triggers download of the file(s), or loads from cache\n", " futures.append(future)" ] @@ -989,7 +994,9 @@ "source": [ "# Check download progress here, https://stackoverflow.com/a/37901797/6611055\n", "responses = []\n", - "for f in tqdm.tqdm(iterable=dask.distributed.as_completed(futures=futures), total=len(futures)):\n", + "for f in tqdm.tqdm(\n", + " iterable=dask.distributed.as_completed(futures=futures), total=len(futures)\n", + "):\n", " responses.append(f.result())" ] }, @@ -1080,28 +1087,34 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 1 + }, "outputs": [], "source": [ - "dataset = catalog.icesat2atl06.to_dask() # unfortunately, we have to load this in dask to get the path...\n", + "dataset = (\n", + " catalog.icesat2atl06.to_dask()\n", + ") # unfortunately, we have to load this in dask to get the path...\n", "root_directory = os.path.dirname(os.path.dirname(dataset.encoding[\"source\"]))" ] }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "outputs": [], "source": [ "def get_crossing_dates(\n", " catalog_entry: intake.catalog.local.LocalCatalogEntry,\n", " root_directory: str,\n", - " referencegroundtrack: str=\"????\",\n", + " referencegroundtrack: str = \"????\",\n", " datetime=\"*\",\n", " cyclenumber=\"??\",\n", " orbitalsegment=\"??\",\n", " version=\"002\",\n", - " 
revision=\"01\"\n", + " revision=\"01\",\n", "):\n", " \"\"\"\n", " Given a 4-digit reference groundtrack (e.g. 1234),\n", @@ -1109,23 +1122,28 @@ " key is the date in \"YYYY.MM.DD\" format when an ICESAT2 crossing was made and the\n", " value is the filepath to the HDF5 data file.\n", " \"\"\"\n", - " \n", + "\n", " # Get a glob string that looks like \"ATL06_??????????????_XXXX????_002_01.h5\"\n", " globpath = catalog_entry.path_as_pattern\n", " if datetime == \"*\":\n", " globpath = globpath.replace(\"{datetime:%Y%m%d%H%M%S}\", \"??????????????\")\n", " globpath = globpath.format(\n", - " referencegroundtrack=referencegroundtrack, cyclenumber=cyclenumber, orbitalsegment=orbitalsegment,\n", - " version=version, revision=revision\n", + " referencegroundtrack=referencegroundtrack,\n", + " cyclenumber=cyclenumber,\n", + " orbitalsegment=orbitalsegment,\n", + " version=version,\n", + " revision=revision,\n", " )\n", - " \n", + "\n", " # Get list of filepaths (dates are contained in the filepath)\n", " globedpaths = glob.glob(os.path.join(root_directory, \"??????????\", globpath))\n", - " \n", + "\n", " # Pick out just the dates in \"YYYY.MM.DD\" format from the globedpaths\n", " # crossingdates = [os.path.basename(os.path.dirname(p=p)) for p in globedpaths]\n", - " crossingdates = {os.path.basename(os.path.dirname(p=p)): p for p in sorted(globedpaths)}\n", - " \n", + " crossingdates = {\n", + " os.path.basename(os.path.dirname(p=p)): p for p in sorted(globedpaths)\n", + " }\n", + "\n", " return crossingdates" ] }, @@ -1136,10 +1154,12 @@ "outputs": [], "source": [ "crossing_dates_dict = {}\n", - "for rgt in range(0,1388): # ReferenceGroundTrack goes from 0001 to 1387\n", + "for rgt in range(0, 1388): # ReferenceGroundTrack goes from 0001 to 1387\n", " referencegroundtrack = f\"{rgt}\".zfill(4)\n", " crossing_dates = dask.delayed(get_crossing_dates)(\n", - " catalog_entry=catalog.icesat2atl06, root_directory=root_directory, 
referencegroundtrack=referencegroundtrack\n", + " catalog_entry=catalog.icesat2atl06,\n", + " root_directory=root_directory,\n", + " referencegroundtrack=referencegroundtrack,\n", " )\n", " crossing_dates_dict[referencegroundtrack] = crossing_dates\n", "crossing_dates_dict = dask.compute(crossing_dates_dict)[0]" @@ -1201,7 +1221,7 @@ " concatenate all points from all crossing dates into one xr.Dataset\n", " \"\"\"\n", " lasers = [\"gt1l\", \"gt1r\", \"gt2l\", \"gt2r\", \"gt3l\", \"gt3r\"]\n", - " \n", + "\n", " objs = [\n", " xr.open_mfdataset(\n", " paths=crossing_dates.values(),\n", @@ -1213,13 +1233,17 @@ " ).assign_coords(coords={\"laser\": laser})\n", " for laser in lasers\n", " ]\n", - " \n", + "\n", " try:\n", - " da = xr.concat(objs=objs, dim=\"laser\") # dim=pd.Index(data=lasers, name=\"laser\")\n", + " da = xr.concat(\n", + " objs=objs, dim=\"laser\"\n", + " ) # dim=pd.Index(data=lasers, name=\"laser\")\n", " df = da.unify_chunks().to_dask_dataframe()\n", " except ValueError:\n", " # ValueError: cannot reindex or align along dimension 'delta_time' because the index has duplicate values\n", - " df = dask.dataframe.concat([obj.unify_chunks().to_dask_dataframe() for obj in objs])\n", + " df = dask.dataframe.concat(\n", + " [obj.unify_chunks().to_dask_dataframe() for obj in objs]\n", + " )\n", "\n", " return df" ] @@ -1231,8 +1255,10 @@ "outputs": [], "source": [ "dataset_dict = {}\n", - "#for referencegroundtrack in list(crossing_dates_dict)[349:350]: # ReferenceGroundTrack goes from 0001 to 1387\n", - "for referencegroundtrack in list(crossing_dates_dict)[340:350]: # ReferenceGroundTrack goes from 0001 to 1387\n", + "# for referencegroundtrack in list(crossing_dates_dict)[349:350]: # ReferenceGroundTrack goes from 0001 to 1387\n", + "for referencegroundtrack in list(crossing_dates_dict)[\n", + " 340:350\n", + "]: # ReferenceGroundTrack goes from 0001 to 1387\n", " # print(referencegroundtrack)\n", " if len(crossing_dates_dict[referencegroundtrack]) > 0:\n", 
" da = dask.delayed(six_laser_beams)(\n", @@ -1401,7 +1427,9 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_dict = dask.compute(dataset_dict)[0] # compute every referencegroundtrack, slow... though somewhat parallelized" + "dataset_dict = dask.compute(dataset_dict)[\n", + " 0\n", + "] # compute every referencegroundtrack, slow... though somewhat parallelized" ] }, { @@ -1426,9 +1454,19 @@ "metadata": {}, "outputs": [], "source": [ - "da.sel(crossingdates=\"2018.10.21\").h_li.unify_chunks().drop(labels=[\"longitude\", \"datetime\", \"cyclenumber\"]).hvplot(\n", - " kind=\"scatter\", x=\"latitude\", by=\"crossingdates\", datashade=True, dynspread=True,\n", - " width=800, height=500, dynamic=True, flip_xaxis=True, hover=True\n", + "da.sel(crossingdates=\"2018.10.21\").h_li.unify_chunks().drop(\n", + " labels=[\"longitude\", \"datetime\", \"cyclenumber\"]\n", + ").hvplot(\n", + " kind=\"scatter\",\n", + " x=\"latitude\",\n", + " by=\"crossingdates\",\n", + " datashade=True,\n", + " dynspread=True,\n", + " width=800,\n", + " height=500,\n", + " dynamic=True,\n", + " flip_xaxis=True,\n", + " hover=True,\n", ")" ] }, @@ -1738,9 +1776,12 @@ "outputs": [], "source": [ "dfs.hvplot.scatter(\n", - " x=\"longitude\", y=\"latitude\", by=\"laser\", hover_cols=[\"delta_time\", \"segment_id\"],\n", - " #datashade=True, dynspread=True,\n", - " #width=800, height=500, colorbar=True\n", + " x=\"longitude\",\n", + " y=\"latitude\",\n", + " by=\"laser\",\n", + " hover_cols=[\"delta_time\", \"segment_id\"],\n", + " # datashade=True, dynspread=True,\n", + " # width=800, height=500, colorbar=True\n", ")" ] }, @@ -1759,7 +1800,11 @@ "metadata": {}, "outputs": [], "source": [ - "transformer = pyproj.Transformer.from_crs(crs_from=pyproj.CRS.from_epsg(4326), crs_to=pyproj.CRS.from_epsg(3031), always_xy=True)" + "transformer = pyproj.Transformer.from_crs(\n", + " crs_from=pyproj.CRS.from_epsg(4326),\n", + " crs_to=pyproj.CRS.from_epsg(3031),\n", + " always_xy=True,\n", + ")" ] }, { @@ 
-1781,7 +1826,9 @@ } ], "source": [ - "dfs[\"x\"], dfs[\"y\"] = transformer.transform(xx=dfs.longitude.values, yy=dfs.latitude.values)" + "dfs[\"x\"], dfs[\"y\"] = transformer.transform(\n", + " xx=dfs.longitude.values, yy=dfs.latitude.values\n", + ")" ] }, { @@ -2054,9 +2101,12 @@ "outputs": [], "source": [ "dfs.hvplot.scatter(\n", - " x=\"x\", y=\"y\", by=\"laser\", hover_cols=[\"delta_time\", \"segment_id\", \"h_li\"],\n", - " #datashade=True, dynspread=True,\n", - " #width=800, height=500, colorbar=True\n", + " x=\"x\",\n", + " y=\"y\",\n", + " by=\"laser\",\n", + " hover_cols=[\"delta_time\", \"segment_id\", \"h_li\"],\n", + " # datashade=True, dynspread=True,\n", + " # width=800, height=500, colorbar=True\n", ")" ] }, @@ -2110,15 +2160,15 @@ "# https://github.com/ICESAT-2HackWeek/gridding/blob/master/notebook/utils.py#L23\n", "def make_grid(xmin, xmax, ymin, ymax, dx, dy):\n", " \"\"\"Construct output grid-coordinates.\"\"\"\n", - " \n", + "\n", " # Setup grid dimensions\n", " Nn = int((np.abs(ymax - ymin)) / dy) + 1\n", " Ne = int((np.abs(xmax - xmin)) / dx) + 1\n", - " \n", + "\n", " # Initiate x/y vectors for grid\n", " x_i = np.linspace(xmin, xmax, num=Ne)\n", " y_i = np.linspace(ymin, ymax, num=Nn)\n", - " \n", + "\n", " return np.meshgrid(x_i, y_i)" ] }, @@ -2128,7 +2178,9 @@ "metadata": {}, "outputs": [], "source": [ - "xi, yi = make_grid(xmin=dfs.x.min(), xmax=dfs.x.max(), ymin=dfs.y.max(), ymax=dfs.y.min(), dx=10, dy=10)" + "xi, yi = make_grid(\n", + " xmin=dfs.x.min(), xmax=dfs.x.max(), ymin=dfs.y.max(), ymax=dfs.y.min(), dx=10, dy=10\n", + ")" ] }, { @@ -2223,7 +2275,7 @@ "outputs": [], "source": [ "# https://xrviz.readthedocs.io/en/latest/set_initial_parameters.html\n", - "initial_params={\n", + "initial_params = {\n", " # Select variable to plot\n", " \"Variables\": \"h_li\",\n", " # Set coordinates\n", @@ -2231,13 +2283,13 @@ " # Axes\n", " \"x\": \"longitude\",\n", " \"y\": \"latitude\",\n", - " #\"sigma\": \"animate\",\n", + " # \"sigma\": 
\"animate\",\n", " # Projection\n", - " #\"is_geo\": True,\n", - " #\"basemap\": True,\n", - " #\"crs\": \"PlateCarree\"\n", + " # \"is_geo\": True,\n", + " # \"basemap\": True,\n", + " # \"crs\": \"PlateCarree\"\n", "}\n", - "dashboard = xrviz.dashboard.Dashboard(data=dataset) #, initial_params=initial_params)" + "dashboard = xrviz.dashboard.Dashboard(data=dataset) # , initial_params=initial_params)" ] }, { @@ -2288,13 +2340,16 @@ "outputs": [], "source": [ "# Paste the OpenAltimetry selection parameters here\n", - "OA_REFERENCE_URL = 'minx=-177.64275595145213&miny=-88.12014866942751&maxx=-128.25920892322736&maxy=-85.52394234080862&date=2019-05-02&trackId=515'\n", - "# We populate a list with the photon data using the OpenAltimetry API, no HDF! \n", - "OA_URL = 'https://openaltimetry.org/data/icesat2/getPhotonData?client=jupyter&' + OA_REFERENCE_URL\n", - "OA_PHOTONS = ['Noise', 'Low', 'Medium', 'High']\n", + "OA_REFERENCE_URL = \"minx=-177.64275595145213&miny=-88.12014866942751&maxx=-128.25920892322736&maxy=-85.52394234080862&date=2019-05-02&trackId=515\"\n", + "# We populate a list with the photon data using the OpenAltimetry API, no HDF!\n", + "OA_URL = (\n", + " \"https://openaltimetry.org/data/icesat2/getPhotonData?client=jupyter&\"\n", + " + OA_REFERENCE_URL\n", + ")\n", + "OA_PHOTONS = [\"Noise\", \"Low\", \"Medium\", \"High\"]\n", "# OA_PLOTTED_BEAMS = [1,2,3,4,5,6] you can select up to 6 beams for each ground track.\n", "# Some beams may not be usable due cloud covering or QC issues.\n", - "OA_BEAMS = [3,4]" + "OA_BEAMS = [3, 4]" ] }, { @@ -2304,10 +2359,19 @@ "outputs": [], "source": [ "minx, miny, maxx, maxy = [-156, -88, -127, -84]\n", - "date = \"2019-05-02\" # UTC date?\n", - "track = 515 # \n", - "beam = 1 # 1 to 6\n", - "params = {\"client\": \"jupyter\", \"minx\": minx, \"miny\": miny, \"maxx\": maxx, \"maxy\": maxy, \"date\": date, \"trackId\": str(track), \"beam\": str(beam)}" + "date = \"2019-05-02\" # UTC date?\n", + "track = 515 #\n", + "beam 
= 1 # 1 to 6\n", + "params = {\n", + " \"client\": \"jupyter\",\n", + " \"minx\": minx,\n", + " \"miny\": miny,\n", + " \"maxx\": maxx,\n", + " \"maxy\": maxy,\n", + " \"date\": date,\n", + " \"trackId\": str(track),\n", + " \"beam\": str(beam),\n", + "}" ] }, { @@ -2316,7 +2380,9 @@ "metadata": {}, "outputs": [], "source": [ - "r = requests.get(url=\"https://openaltimetry.org/data/icesat2/getPhotonData\", params=params)" + "r = requests.get(\n", + " url=\"https://openaltimetry.org/data/icesat2/getPhotonData\", params=params\n", + ")" ] }, { @@ -2327,11 +2393,15 @@ "source": [ "# OpenAltimetry Data cleansing\n", "df = pd.io.json.json_normalize(data=r.json()[\"series\"], meta=\"name\", record_path=\"data\")\n", - "df.name = df.name.str.split().str.get(0) # Get e.g. just \"Low\" instead of \"Low [12345]\"\n", - "df.query(expr=\"name in ('Low', 'Medium', 'High')\", inplace=True) # filter out Noise and Buffer points\n", + "df.name = df.name.str.split().str.get(0) # Get e.g. just \"Low\" instead of \"Low [12345]\"\n", + "df.query(\n", + " expr=\"name in ('Low', 'Medium', 'High')\", inplace=True\n", + ") # filter out Noise and Buffer points\n", "\n", "df.rename(columns={0: \"latitude\", 1: \"elevation\", 2: \"longitude\"}, inplace=True)\n", - "df = df.reindex(columns=[\"longitude\", \"latitude\", \"elevation\", \"name\"]) # reorder columns\n", + "df = df.reindex(\n", + " columns=[\"longitude\", \"latitude\", \"elevation\", \"name\"]\n", + ") # reorder columns\n", "df.reset_index(inplace=True)\n", "df" ] @@ -2354,6 +2424,14 @@ } ], "metadata": { + "jupytext": { + "text_representation": { + "extension": ".py", + "format_name": "hydrogen", + "format_version": "1.3", + "jupytext_version": "1.4.2" + } + }, "kernelspec": { "display_name": "deepicedrain", "language": "python", diff --git a/atl06_play.py b/atl06_play.py new file mode 100644 index 0000000..4758f9b --- /dev/null +++ b/atl06_play.py @@ -0,0 +1,576 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# 
extension: .py +# format_name: hydrogen +# format_version: '1.3' +# jupytext_version: 1.4.2 +# kernelspec: +# display_name: deepicedrain +# language: python +# name: deepicedrain +# --- + +# %% [markdown] +# # **ATLAS/ICESat-2 Land Ice Height [ATL06](https://nsidc.org/data/atl06/) Exploratory Data Analysis** +# +# [Yet another](https://xkcd.com/927) take on playing with ICESat-2's Land Ice Height ATL06 data, +# specifically with a focus on analyzing ice elevation changes over Antarctica. +# Specifically, this jupyter notebook will cover: +# +# - Downloading datasets from the web via [intake](https://intake.readthedocs.io) +# - Performing [Exploratory Data Analysis](https://en.wikipedia.org/wiki/Exploratory_data_analysis) +# using the [PyData](https://pydata.org) stack (e.g. [xarray](http://xarray.pydata.org), [dask](https://dask.org)) +# - Plotting figures using [Hvplot](https://hvplot.holoviz.org) and [PyGMT](https://www.pygmt.org) (TODO) +# +# This is in contrast with the [icepyx](https://github.com/icesat2py/icepyx) package +# and 'official' 2019/2020 [ICESat-2 Hackweek tutorials](https://github.com/ICESAT-2HackWeek/ICESat2_hackweek_tutorials) (which are also awesome!) +# that tend to use a slightly different approach (e.g. handcoded download scripts, [h5py](http://www.h5py.org) for data reading, etc). +# The core concept here is to run things in a more intuitive and scalable (parallelizable) manner on a continent scale (rather than just a specific region).
+ +# %% +import glob +import json +import logging +import netrc +import os + +import dask +import dask.distributed +import hvplot.dask +import hvplot.pandas +import hvplot.xarray +import intake +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import requests +import tqdm +import xarray as xr + +# %matplotlib inline + +# %% +# Configure intake and set number of compute cores for data download +intake.config.conf["cache_dir"] = "catdir" # saves data to current folder +intake.config.conf["download_progress"] = False # disable automatic tqdm progress bars + +logging.basicConfig(level=logging.WARNING) + +# Limit compute to 8 cores for download part using intake +# Can possibly go up to 10 because there are 10 DPs? +# See https://n5eil02u.ecs.nsidc.org/opendap/hyrax/catalog.xml +client = dask.distributed.Client(n_workers=10, threads_per_worker=1) +client + +# %% [markdown] +# ## Quick view +# +# Use our [intake catalog](https://intake.readthedocs.io/en/latest/catalog.html) to get some sample ATL06 data +# (while making sure we have our Earthdata credentials set up properly), +# and view it using [xarray](https://xarray.pydata.org) and [hvplot](https://hvplot.pyviz.org). 
+ +# %% +catalog = intake.open_catalog( + uri="catalog.yaml" +) # open the local catalog file containing ICESAT2 stuff + +# %% +try: + netrc.netrc() +except FileNotFoundError as error_msg: + print( + f"{error_msg}, please follow instructions to create one at " + "https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled " + 'basically using `echo "machine urs.earthdata.nasa.gov login password " >> ~/.netrc`' + ) + raise + +dataset = ( + catalog.icesat2atl06.to_dask().unify_chunks() +) # depends on .netrc file in home folder +dataset + +# %% +# dataset.hvplot.points( +# x="longitude", y="latitude", datashade=True, width=800, height=500, hover=True, +# #geo=True, coastline=True, crs=cartopy.crs.PlateCarree(), #projection=cartopy.crs.Stereographic(central_latitude=-71), +# ) +catalog.icesat2atl06.hvplot.quickview() + +# %% [markdown] +# ## Data intake +# +# Pulling in all of the raw ATL06 data (HDF5 format) from the NSIDC servers via an intake catalog file. +# Note that this will involve 100s if not 1000s of GBs of data, so make sure there's enough storage!! 
+ +# %% +# Download all ICESAT2 ATLAS hdf files from start to end date +dates1 = pd.date_range(start="2018.10.14", end="2018.12.08") # 1st batch +dates2 = pd.date_range(start="2018.12.10", end="2019.06.26") # 2nd batch +dates3 = pd.date_range(start="2019.07.26", end="2020.03.06") # 3rd batch +dates = dates1.append(other=dates2).append(other=dates3) + +# %% +# Submit download jobs to Client +futures = [] +for date in dates: + source = catalog.icesat2atlasdownloader(date=date) + future = client.submit( + func=source.discover, key=f"download-{date}", + ) # triggers download of the file(s), or loads from cache + futures.append(future) + +# %% +# Check download progress here, https://stackoverflow.com/a/37901797/6611055 +responses = [] +for f in tqdm.tqdm( + iterable=dask.distributed.as_completed(futures=futures), total=len(futures) +): + responses.append(f.result()) + +# %% +# In case of error, check which downloads are unfinished +# Manually delete those folders and retry +unfinished = [] +for foo in futures: + if foo.status != "finished": + print(foo) + unfinished.append(foo) + # foo.retry() + +# %% +try: + assert len(unfinished) == 0 +except AssertionError: + for task in unfinished: + print(task) + raise ValueError( + f"{len(unfinished)} download tasks are unfinished," + " please delete those folders and retry again!" 
+ ) + +# %% [raw] +# with tqdm.tqdm(total=len(dates)) as pbar: +# for date in dates: +# source = catalog.icesat2atlasdownloader(date=date) +# source_urlpath = source.urlpath +# try: +# pbar.set_postfix_str(f"Obtaining files from {source_urlpath}") +# source.discover() # triggers download of the file(s), or loads from cache +# except (requests.HTTPError, OSError, KeyError, TypeError) as error: +# # clear cache and try again +# print(f"Errored: {error}, trying again") +# source.cache[0].clear_cache(urlpath=source_urlpath) +# source.discover() +# except (ValueError, pd.core.index.InvalidIndexError) as error: +# print(f"Errored: {error}, ignoring") +# pass +# pbar.update(n=1) +# #finally: +# # source.close() +# # del source + +# %% [raw] +# catalog.icesat2atl06(date="2019.06.24", laser="gt1l").discover() # ValueError?? +# catalog.icesat2atl06(date="2019.02.28", laser="gt2l").discover() # InvalidIndexError +# catalog.icesat2atl06(date="2019.11.13", laser="gt2l").discover() # ValueError + +# %% + +# %% [markdown] +# ## Exploratory data analysis on local files +# +# Now that we've downloaded a good chunk of data and cached them locally, +# we can have some fun with visualizing the point clouds! + +# %% +dataset = ( + catalog.icesat2atl06.to_dask() +) # unfortunately, we have to load this in dask to get the path... +root_directory = os.path.dirname(os.path.dirname(dataset.encoding["source"])) + +# %% +def get_crossing_dates( + catalog_entry: intake.catalog.local.LocalCatalogEntry, + root_directory: str, + referencegroundtrack: str = "????", + datetime="*", + cyclenumber="??", + orbitalsegment="??", + version="003", + revision="01", +): + """ + Given a 4-digit reference groundtrack (e.g. 1234), + we output a dictionary where the + key is the date in "YYYY.MM.DD" format when an ICESAT2 crossing was made and the + value is the filepath to the HDF5 data file. 
+ """ + + # Get a glob string that looks like "ATL06_??????????????_XXXX????_002_01.h5" + globpath = catalog_entry.path_as_pattern + if datetime == "*": + globpath = globpath.replace("{datetime:%Y%m%d%H%M%S}", "??????????????") + globpath = globpath.format( + referencegroundtrack=referencegroundtrack, + cyclenumber=cyclenumber, + orbitalsegment=orbitalsegment, + version=version, + revision=revision, + ) + + # Get list of filepaths (dates are contained in the filepath) + globedpaths = glob.glob(os.path.join(root_directory, "??????????", globpath)) + + # Pick out just the dates in "YYYY.MM.DD" format from the globedpaths + # crossingdates = [os.path.basename(os.path.dirname(p=p)) for p in globedpaths] + crossingdates = { + os.path.basename(os.path.dirname(p=p)): p for p in sorted(globedpaths) + } + + return crossingdates + + +# %% +crossing_dates_dict = {} +for rgt in range(0, 1388): # ReferenceGroundTrack goes from 0001 to 1387 + referencegroundtrack = f"{rgt}".zfill(4) + crossing_dates = dask.delayed(get_crossing_dates)( + catalog_entry=catalog.icesat2atl06, + root_directory=root_directory, + referencegroundtrack=referencegroundtrack, + ) + crossing_dates_dict[referencegroundtrack] = crossing_dates +crossing_dates_dict = dask.compute(crossing_dates_dict)[0] + +# %% +crossing_dates_dict["0349"].keys() + + +# %% [markdown] +# ![ICESat-2 Laser Beam Pattern](https://ars.els-cdn.com/content/image/1-s2.0-S0034425719303712-gr1.jpg) + +# %% [raw] +# # For one laser along one reference ground track, +# # concatenate all points from all dates into one xr.Dataset +# da = xr.concat( +# objs=( +# catalog.icesat2atl06(date=date, laser="gt1r") +# .to_dask() +# .sel(referencegroundtrack=referencegroundtrack) +# for date in crossing_dates +# ), +# dim=pd.Index(data=crossing_dates, name="crossingdates"), +# ) + +# %% +def six_laser_beams(crossing_dates: list): + """ + For all 6 lasers along one reference ground track, + concatenate all points from all crossing dates into one 
xr.Dataset + """ + lasers = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] + + objs = [ + xr.open_mfdataset( + paths=crossing_dates.values(), + combine="nested", + engine="h5netcdf", + concat_dim="delta_time", + group=f"{laser}/land_ice_segments", + parallel=True, + ).assign_coords(coords={"laser": laser}) + for laser in lasers + ] + + try: + da = xr.concat( + objs=objs, dim="laser" + ) # dim=pd.Index(data=lasers, name="laser") + df = da.unify_chunks().to_dask_dataframe() + except ValueError: + # ValueError: cannot reindex or align along dimension 'delta_time' because the index has duplicate values + df = dask.dataframe.concat( + [obj.unify_chunks().to_dask_dataframe() for obj in objs] + ) + + return df + + +# %% +dataset_dict = {} +# for referencegroundtrack in list(crossing_dates_dict)[349:350]: # ReferenceGroundTrack goes from 0001 to 1387 +for referencegroundtrack in list(crossing_dates_dict)[ + 340:350 +]: # ReferenceGroundTrack goes from 0001 to 1387 + # print(referencegroundtrack) + if len(crossing_dates_dict[referencegroundtrack]) > 0: + da = dask.delayed(six_laser_beams)( + crossing_dates=crossing_dates_dict[referencegroundtrack] + ) + # da = six_laser_beams(crossing_dates=crossing_dates_dict[referencegroundtrack]) + dataset_dict[referencegroundtrack] = da + +# %% +df = dataset_dict["0349"].compute() # loads into a dask dataframe (lazy) + +# %% +df + +# %% + +# %% +dataset_dict = dask.compute(dataset_dict)[ + 0 +] # compute every referencegroundtrack, slow... 
though somewhat parallelized + +# %% +bdf = dask.dataframe.concat(dfs=list(dataset_dict.values())) + +# %% + +# %% +da.sel(crossingdates="2018.10.21").h_li.unify_chunks().drop( + labels=["longitude", "datetime", "cyclenumber"] +).hvplot( + kind="scatter", + x="latitude", + by="crossingdates", + datashade=True, + dynspread=True, + width=800, + height=500, + dynamic=True, + flip_xaxis=True, + hover=True, +) + +# %% + +# %% [raw] +# # https://xarray.pydata.org/en/stable/combining.html#concatenate +# # For all 6 lasers on one date ~~along one reference ground track~~, +# # concatenate all points ~~from one dates~~ into one xr.Dataset +# lasers = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] +# da = xr.concat( +# objs=( +# catalog.icesat2atl06(laser=laser) +# .to_dask() +# #.sel(referencegroundtrack=referencegroundtrack) +# for laser in lasers +# ), +# dim=pd.Index(data=lasers, name="laser") +# ) + +# %% + +# %% [markdown] +# ## Plot them points! + +# %% +# convert dask.dataframe to pd.DataFrame +df = df.compute() + +# %% +df = df.dropna(subset=["h_li"]).query(expr="atl06_quality_summary == 0").reset_index() + +# %% +dfs = df.query(expr="0 <= segment_id - 1443620 < 900") +dfs + +# %% +dfs.hvplot.scatter( + x="longitude", + y="latitude", + by="laser", + hover_cols=["delta_time", "segment_id"], + # datashade=True, dynspread=True, + # width=800, height=500, colorbar=True +) + +# %% +import pyproj + +# %% +transformer = pyproj.Transformer.from_crs( + crs_from=pyproj.CRS.from_epsg(4326), + crs_to=pyproj.CRS.from_epsg(3031), + always_xy=True, +) + +# %% +dfs["x"], dfs["y"] = transformer.transform( + xx=dfs.longitude.values, yy=dfs.latitude.values +) + +# %% +dfs + +# %% +dfs.hvplot.scatter( + x="x", + y="y", + by="laser", + hover_cols=["delta_time", "segment_id", "h_li"], + # datashade=True, dynspread=True, + # width=800, height=500, colorbar=True +) + +# %% +dfs.hvplot.scatter(x="x", y="h_li", by="laser") + +# %% +dfs.to_pickle(path="icesat2_sample.pkl") + +# %% + +# %%
[markdown] +# ## Old making a DEM grid surface from points + +# %% +import scipy + + +# %% +# https://github.com/ICESAT-2HackWeek/gridding/blob/master/notebook/utils.py#L23 +def make_grid(xmin, xmax, ymin, ymax, dx, dy): + """Construct output grid-coordinates.""" + + # Setup grid dimensions + Nn = int((np.abs(ymax - ymin)) / dy) + 1 + Ne = int((np.abs(xmax - xmin)) / dx) + 1 + + # Initiate x/y vectors for grid + x_i = np.linspace(xmin, xmax, num=Ne) + y_i = np.linspace(ymin, ymax, num=Nn) + + return np.meshgrid(x_i, y_i) + + +# %% +xi, yi = make_grid( + xmin=dfs.x.min(), xmax=dfs.x.max(), ymin=dfs.y.max(), ymax=dfs.y.min(), dx=10, dy=10 +) + +# %% +ar = scipy.interpolate.griddata(points=(dfs.x, dfs.y), values=dfs.h_li, xi=(xi, yi)) + +# %% +plt.imshow(ar, extent=(dfs.x.min(), dfs.x.max(), dfs.y.min(), dfs.y.max())) + +# %% + +# %% +import plotly.express as px + +# %% +px.scatter_3d(data_frame=dfs, x="longitude", y="latitude", z="h_li", color="laser") + +# %% + +# %% [markdown] +# ### Play using XrViz +# +# Install the PyViz JupyterLab extension first using the [extension manager](https://jupyterlab.readthedocs.io/en/stable/user/extensions.html#using-the-extension-manager) or via the command below: +# +# ```bash +# jupyter labextension install @pyviz/jupyterlab_pyviz@v0.8.0 --no-build +# jupyter labextension list # check to see that extension is installed +# jupyter lab build --debug # build extension ??? with debug messages printed +# ``` +# +# Note: Had to add `network-timeout 600000` to `.yarnrc` file to resolve university network issues. 
+ +# %% +import xrviz + +# %% +xrviz.example() + +# %% +# https://xrviz.readthedocs.io/en/latest/set_initial_parameters.html +initial_params = { + # Select variable to plot + "Variables": "h_li", + # Set coordinates + "Set Coords": ["longitude", "latitude"], + # Axes + "x": "longitude", + "y": "latitude", + # "sigma": "animate", + # Projection + # "is_geo": True, + # "basemap": True, + # "crs": "PlateCarree" +} +dashboard = xrviz.dashboard.Dashboard(data=dataset) # , initial_params=initial_params) + +# %% +dashboard.panel + +# %% +dashboard.show() + +# %% + +# %% [markdown] +# ## OpenAltimetry + +# %% +"minx=-154.56678505984297&miny=-88.82881451427136&maxx=-125.17872921546498&maxy=-81.34051361301398&date=2019-05-02&trackId=516" + +# %% +# Paste the OpenAltimetry selection parameters here +OA_REFERENCE_URL = "minx=-177.64275595145213&miny=-88.12014866942751&maxx=-128.25920892322736&maxy=-85.52394234080862&date=2019-05-02&trackId=515" +# We populate a list with the photon data using the OpenAltimetry API, no HDF! +OA_URL = ( + "https://openaltimetry.org/data/icesat2/getPhotonData?client=jupyter&" + + OA_REFERENCE_URL +) +OA_PHOTONS = ["Noise", "Low", "Medium", "High"] +# OA_PLOTTED_BEAMS = [1,2,3,4,5,6] you can select up to 6 beams for each ground track. +# Some beams may not be usable due to cloud cover or QC issues. +OA_BEAMS = [3, 4] + +# %% +minx, miny, maxx, maxy = [-156, -88, -127, -84] +date = "2019-05-02" # UTC date? +track = 515 # +beam = 1 # 1 to 6 +params = { + "client": "jupyter", + "minx": minx, + "miny": miny, + "maxx": maxx, + "maxy": maxy, + "date": date, + "trackId": str(track), + "beam": str(beam), +} + +# %% +r = requests.get( + url="https://openaltimetry.org/data/icesat2/getPhotonData", params=params +) + +# %% +# OpenAltimetry Data cleansing +df = pd.io.json.json_normalize(data=r.json()["series"], meta="name", record_path="data") +df.name = df.name.str.split().str.get(0) # Get e.g.
just "Low" instead of "Low [12345]" +df.query( + expr="name in ('Low', 'Medium', 'High')", inplace=True +) # filter out Noise and Buffer points + +df.rename(columns={0: "latitude", 1: "elevation", 2: "longitude"}, inplace=True) +df = df.reindex( + columns=["longitude", "latitude", "elevation", "name"] +) # reorder columns +df.reset_index(inplace=True) +df + +# %% +df.hvplot.scatter(x="latitude", y="elevation") + +# %%