Skip to content

Commit

Permalink
📦 Refresh ATL11 Zarr data to 20201111
Browse files Browse the repository at this point in the history
Reprocessed 2147/4160 ATL11 files and downloaded the rest from NSIDC! This is ATL11 version 2 (ATL11.002). Browser crashed again at 31%, but that's ok as it's only about 4 days of processing. CPU time saved on processing was spent instead on coding up the magical ATL11 download script (ATL11 has a crazy folder structure, if only STAC worked better!) that requires a manual trip to https://nsidc.org/data/ATL11?qt-data_set_tabs=1#qt-data_set_tabs with the input search string "1?_0308_002_01.h5". Note that the download also required a downgrade to fsspec 0.7.4 and intake-xarray 0.3.2. This is a new refresh over 66dc627. No major schema change was noted, and mixing and matching the official and locally processed ATL11 files seems ok. Refactored the atl06_to_atl11.ipynb script quite a bit, but more needs to be done (read: use intake better).
  • Loading branch information
weiji14 committed Jan 20, 2021
1 parent 2c755d5 commit d9ad9db
Show file tree
Hide file tree
Showing 7 changed files with 6,573 additions and 4,261 deletions.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ MANIFEST

# Data files and folders
**/*.h5
**/*.tsv
ATL06.003
ATL11.001
ATL11.001z123
ATL11.00?
ATL11.00?z123
ATLXI
Quantarctica3

Expand Down
6,308 changes: 2,147 additions & 4,161 deletions ATL06_to_ATL11_Antarctica.sh

Large diffs are not rendered by default.

4,159 changes: 4,159 additions & 0 deletions ATL11_to_download.txt

Large diffs are not rendered by default.

187 changes: 133 additions & 54 deletions atl06_to_atl11.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"source": [
"import os\n",
"import glob\n",
"import shutil\n",
"import sys\n",
"import subprocess\n",
"\n",
Expand Down Expand Up @@ -52,23 +53,23 @@
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3 style=\"text-align: left;\">Client</h3>\n",
"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:37523</li>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:35153</li>\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a></li>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3 style=\"text-align: left;\">Cluster</h3>\n",
"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
" <li><b>Workers: </b>72</li>\n",
" <li><b>Cores: </b>72</li>\n",
" <li><b>Workers: </b>10</li>\n",
" <li><b>Cores: </b>10</li>\n",
" <li><b>Memory: </b>201.22 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: 'tcp://127.0.0.1:37523' processes=72 threads=72, memory=201.22 GB>"
"<Client: 'tcp://127.0.0.1:35153' processes=10 threads=10, memory=201.22 GB>"
]
},
"execution_count": 2,
Expand All @@ -77,29 +78,97 @@
}
],
"source": [
"client = dask.distributed.Client(n_workers=72, threads_per_worker=1)\n",
"client = dask.distributed.Client(n_workers=10, threads_per_worker=1)\n",
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download ATL11 from [NSIDC](https://doi.org/10.5067/ATLAS/ATL11.002) up to cycle 8"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"136"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Submit download jobs to Client\n",
"catalog = intake.open_catalog(\"deepicedrain/atlas_catalog.yaml\")\n",
"with open(file=\"ATL11_to_download.txt\", mode=\"r\") as f:\n",
" urlpaths = f.readlines()\n",
"dates: set = set(url.split(\"/\")[-2] for url in urlpaths)\n",
"len(dates)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def first_last_cycle_numbers(referencegroundtrack: int, orbitalsegment: int):\n",
" \"\"\"\n",
" Obtain the first and last cycle numbers for an ATL06 track, given the\n",
" reference ground track and orbital segment number as input.\n",
" \"\"\"\n",
" files = glob.glob(\n",
" f\"ATL06.003/**/ATL06*_*_{referencegroundtrack:04d}??{orbitalsegment:02d}_*.h5\"\n",
" )\n",
"\n",
" first_cycle = min(files)[-14:-12] # e.g. '02'\n",
" last_cycle = max(files)[-14:-12] # e.g. '07'\n",
"\n",
" return first_cycle, last_cycle"
"# Submit download jobs to Client\n",
"futures = []\n",
"for date in dates:\n",
" # date = \"2019.11.01\" # sorted(dates)[-1]\n",
" source = catalog.icesat2atl11(date=date)\n",
" future = client.submit(\n",
" func=source.discover, key=f\"download-{date}\"\n",
" ) # triggers download of the file(s), or loads from cache\n",
" futures.append(future)\n",
" # break\n",
" # source.urlpath"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 136/136 [1:16:59<00:00, 33.97s/it]\n"
]
}
],
"source": [
"# Check download progress here, https://stackoverflow.com/a/37901797/6611055\n",
"responses = []\n",
"for f in tqdm.tqdm(\n",
" iterable=dask.distributed.as_completed(futures=futures), total=len(futures)\n",
"):\n",
" responses.append(f.result())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Process ATL06 to ATL11 for cycle 9 or newer"
]
},
{
Expand All @@ -113,40 +182,53 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 4161/4161 [02:24<00:00, 28.78it/s]\n"
"100%|██████████| 4161/4161 [02:21<00:00, 29.44it/s]"
]
}
],
"source": [
"# Create ATL06_to_ATL11 processing script, if not already present\n",
"if not os.path.exists(\"ATL06_to_ATL11_Antarctica.sh\"):\n",
" # find first and last cycles for each reference ground track and each orbital segment\n",
" # find last cycle for each reference ground track and each orbital segment\n",
" last_cycle_func = lambda rgt, ost: int(\n",
" max(glob.glob(f\"ATL06.003/**/ATL06*_*_{rgt:04d}??{ost:02d}_*.h5\"))[-14:-12]\n",
" )\n",
" futures = []\n",
" for referencegroundtrack in range(1387, 0, -1):\n",
" for orbitalsegment in [10, 11, 12]: # loop through Antarctic orbital segments\n",
" cyclenums = client.submit(\n",
" first_last_cycle_numbers,\n",
" referencegroundtrack,\n",
" orbitalsegment,\n",
" key=f\"{referencegroundtrack:04d}-{orbitalsegment}\",\n",
" )\n",
" futures.append(cyclenums)\n",
" for referencegroundtrack, orbitalsegment in itertools.product(\n",
" range(1387, 0, -1), [10, 11, 12]\n",
" ):\n",
" cyclenum = client.submit(\n",
" last_cycle_func,\n",
" referencegroundtrack,\n",
" orbitalsegment,\n",
" key=f\"{referencegroundtrack:04d}-{orbitalsegment}\",\n",
" )\n",
" futures.append(cyclenum)\n",
"\n",
" # Prepare string to write into ATL06_to_ATL11_Antarctica.sh bash script\n",
" writelines = []\n",
" for f in tqdm.tqdm(\n",
" iterable=dask.distributed.as_completed(futures=futures), total=len(futures)\n",
" ):\n",
" referencegroundtrack, orbitalsegment = f.key.split(\"-\")\n",
" first_cycle, last_cycle = f.result()\n",
" writelines.append(\n",
" f\"python3 ATL11/ATL06_to_ATL11.py\"\n",
" f\" {referencegroundtrack} {orbitalsegment}\"\n",
" f\" --cycles {first_cycle} {last_cycle}\"\n",
" f\" --Release 3\"\n",
" f\" --directory 'ATL06.003/**/'\"\n",
" f\" --out_dir ATL11.001\\n\"\n",
" )\n",
" last_cycle = f.result()\n",
" if last_cycle > 8: # Only process those with Cycle 9 and newer locally\n",
" writelines.append(\n",
" f\"python3 ATL11/ATL06_to_ATL11.py\"\n",
" f\" {referencegroundtrack} {orbitalsegment}\"\n",
" f\" --cycles 03 {last_cycle:02d}\"\n",
" f\" --Release 2\"\n",
" f\" --directory 'ATL06.003/**/'\"\n",
" f\" --out_dir ATL11.002\\n\"\n",
" )\n",
" fname = f\"ATL11_{referencegroundtrack}{orbitalsegment}_0308_002_01.h5\"\n",
" if not os.path.exists(f\"ATL11.002/official/{fname}\"):\n",
" try:\n",
" shutil.move(src=f\"ATL11.002/{fname}\", dst=\"ATL11.002/official\")\n",
" except FileNotFoundError:\n",
" pass\n",
" # else: # Just use official NSIDC version for Cycle 8 or older\n",
" # pass\n",
" writelines.sort() # sort writelines in place\n",
"\n",
" # Finally create the bash script\n",
Expand Down Expand Up @@ -175,7 +257,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[7m16% 684:3477=\u001b[0m5d09h24m16s python3 ATL11/ATL06_to_ATL11.py 0255 11 --cycles 03 08 \u001b[0m"
"\u001b[7m31% 686:1461=1d22h49m21s \u001b[0mpython3 ATL11/ATL06_to_ATL11.py 0266 11 --cycles 03 09 \u001b[0m"
]
}
],
Expand All @@ -185,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -228,10 +310,7 @@
}
],
"source": [
"# for atl11file in tqdm.tqdm(iterable=sorted(glob.glob(\"ATL11.001/*.h5\"))):\n",
"# name = os.path.basename(p=os.path.splitext(p=atl11file)[0])\n",
"\n",
"max_cycles: int = max(int(f[-11:-10]) for f in glob.glob(\"ATL11.001/*.h5\"))\n",
"max_cycles: int = max(int(f[-11:-10]) for f in glob.glob(\"ATL11.002/*.h5\"))\n",
"print(f\"{max_cycles} ICESat-2 cycles available\")"
]
},
Expand All @@ -248,7 +327,7 @@
" - Mask values using _FillValue ??\n",
" - Convert attribute format from binary to str\n",
" \"\"\"\n",
" ds = xr.open_dataset(\n",
" ds: xr.Dataset = xr.open_dataset(\n",
" filename_or_obj=atl11file, group=group, engine=\"h5netcdf\", mask_and_scale=True\n",
" )\n",
"\n",
Expand Down Expand Up @@ -281,7 +360,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1387/1387 [00:08<00:00, 158.58it/s]\n"
"100%|██████████| 1387/1387 [00:06<00:00, 206.61it/s]\n"
]
}
],
Expand All @@ -290,7 +369,7 @@
"# Also consolidate all three laser pairs pt1, pt2, pt3 into one file\n",
"atl11_dict = {}\n",
"for rgt in tqdm.trange(1387):\n",
" atl11files: list = glob.glob(f\"ATL11.001/ATL11_{rgt+1:04d}1?_????_00?_0?.h5\")\n",
" atl11files: list = glob.glob(f\"ATL11.002/ATL11_{rgt+1:04d}1?_????_00?_0?.h5\")\n",
"\n",
" try:\n",
" assert len(atl11files) == 3 # Should be 3 files for Orbital Segments 10,11,12\n",
Expand All @@ -303,10 +382,10 @@
"\n",
" if atl11files:\n",
" pattern: dict = intake.source.utils.reverse_format(\n",
" format_string=\"ATL11.001/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{version:3}_{revision:2}.h5\",\n",
" format_string=\"ATL11.002/ATL11_{referencegroundtrack:4}{orbitalsegment:2}_{cycles:4}_{version:3}_{revision:2}.h5\",\n",
" resolved_string=sorted(atl11files)[1], # get the '11' one, not '10' or '12'\n",
" )\n",
" zarrfilepath: str = \"ATL11.001z123/ATL11_{referencegroundtrack}1x_{cycles}_{version}_{revision}.zarr\".format(\n",
" zarrfilepath: str = \"ATL11.002z123/ATL11_{referencegroundtrack}1x_{cycles}_{version}_{revision}.zarr\".format(\n",
" **pattern\n",
" )\n",
" atl11_dict[zarrfilepath] = atl11files"
Expand Down Expand Up @@ -347,7 +426,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1387/1387 [00:16<00:00, 85.90it/s]\n"
"100%|██████████| 1387/1387 [00:18<00:00, 75.32it/s]\n"
]
}
],
Expand Down Expand Up @@ -385,7 +464,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1387/1387 [14:23<00:00, 1.61it/s] \n"
"100%|██████████| 1387/1387 [31:27<00:00, 1.36s/it]\n"
]
}
],
Expand All @@ -409,7 +488,7 @@
{
"data": {
"text/plain": [
"(185450, 8)"
"(183932, 6)"
]
},
"execution_count": 13,
Expand All @@ -430,11 +509,11 @@
"source": [
"# Note, this raw conversion below takes about 11 hours\n",
"# because HDF5 files work on a single thread...\n",
"for atl11file in tqdm.tqdm(iterable=sorted(glob.glob(\"ATL11.001/*.h5\"))):\n",
"for atl11file in tqdm.tqdm(iterable=sorted(glob.glob(\"ATL11.002/*.h5\"))):\n",
" name = os.path.basename(p=os.path.splitext(p=atl11file)[0])\n",
" zarr.convenience.copy_all(\n",
" source=h5py.File(name=atl11file, mode=\"r\"),\n",
" dest=zarr.open_group(store=f\"ATL11.001z/{name}.zarr\", mode=\"w\"),\n",
" dest=zarr.open_group(store=f\"ATL11.002z/{name}.zarr\", mode=\"w\"),\n",
" if_exists=\"skip\",\n",
" without_attrs=True,\n",
" )"
Expand Down
Loading

0 comments on commit d9ad9db

Please sign in to comment.