diff --git a/.github/workflows/ensure_clean_notebooks.py b/.github/workflows/ensure_clean_notebooks.py index 102c7a2..4ecf35c 100644 --- a/.github/workflows/ensure_clean_notebooks.py +++ b/.github/workflows/ensure_clean_notebooks.py @@ -30,13 +30,13 @@ results = [] for notebook in ipynbs: - #if not notebook in exclude_notebooks: - print(f'Checking {notebook}...') - nb = nbformat.read(notebook, as_version=nbformat.NO_CONVERT) - result = nbc.check_notebook(nb, - remove_empty_cells=False, - preserve_cell_metadata=True) - results.append(result) + if not notebook in exclude_notebooks: + print(f'Checking {notebook}...') + nb = nbformat.read(notebook, as_version=nbformat.NO_CONVERT) + result = nbc.check_notebook(nb, + remove_empty_cells=False, + preserve_cell_metadata=True) + results.append(result) if False in results: sys.exit(1) diff --git a/book/_config.yml b/book/_config.yml index 168c9cb..f19fa9d 100644 --- a/book/_config.yml +++ b/book/_config.yml @@ -37,6 +37,8 @@ execute: execute_notebooks: 'force' exclude_patterns: - "**/geospatial-advanced.ipynb" + - "cloud-computing/04-cloud-optimized-icesat2.ipynb" + - "cloud-computing/atl08_parquet_files/atl08_parquet.ipynb" allow_errors: false # Per-cell notebook execution limit (seconds) timeout: 300 diff --git a/book/_toc.yml b/book/_toc.yml index 6419cf7..e377ee4 100644 --- a/book/_toc.yml +++ b/book/_toc.yml @@ -16,11 +16,21 @@ parts: - file: preliminary/checklist - file: preliminary/git - caption: Tutorials + maxdepth: 1 chapters: - file: tutorials/index sections: - file: tutorials/example/tutorial-notebook - file: tutorials/nb-to-package/index.md + - file: tutorials/cloud-computing/00-goals-and-outline + sections: + - file: tutorials/cloud-computing/01-cloud-computing + - file: tutorials/cloud-computing/02-cloud-data-access + - file: tutorials/cloud-computing/03-cloud-optimized-data-access + - file: tutorials/cloud-computing/04-cloud-optimized-icesat2 + - file: tutorials/cloud-computing/atl08_parquet_files/atl08_parquet + options: + - titlesonly: true - caption: Projects chapters: - file: projects/index diff --git a/book/tutorials/cloud-computing/00-goals-and-outline.ipynb b/book/tutorials/cloud-computing/00-goals-and-outline.ipynb new file mode 100644 index 0000000..b9e736b --- /dev/null +++ b/book/tutorials/cloud-computing/00-goals-and-outline.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a888b10c-1f9f-406e-8d14-9648be234d44", + "metadata": {}, + "source": [ + "# Cloud Computing Tutorial\n", + "\n", + "
\n", + "\n", + "```{image} ./images/cloud.gif\n", + ":width: 200px\n", + ":align: center\n", + "```\n", + "\n", + "**Welcome to the Cloud Computing Tutorial!**\n", + "\n", + "This tutorial is just the tip of the ice[SAT-2]berg (😬) of cloud computing. It focuses on accessing data stored in the cloud. An understanding of the difference between the \"download to local\" and \"direct from cloud\" methods of data access will explain how and why the cloud facilitates the scaling and reproducibility of your science.\n", + "\n", + ":::{admonition} Learning Goals\n", + "\n", + "**At the conclusion of this tutorial, you should be able to answer:**\n", + "1. What is cloud computing?\n", + "2. What is cloud object storage and the difference between data stored in the cloud, data on a local file system and data stored in \"on-premise\" data centers.\n", + "3. How to optimize data for reading from cloud object storage.\n", + "\n", + ":::\n", + "\n", + "## Outline\n", + "\n", + "1. [What is cloud computing?](./01-cloud-computing.ipynb)\n", + " 1. Definition of cloud computing\n", + " 2. Exercise: Difference between resources on your local machine and resources in the cloud\n", + " 3. Why you might use cloud computing\n", + "2. [Accessing data in the cloud](./02-cloud-data-access.ipynb)\n", + " 1. Definition of cloud object storage\n", + " 2. Exercise: How many NASA datasets (aka collections) are in the cloud? How many ICESat-2 datasets are in the cloud? Which DAAC manages ICESast-2 data?\n", + " 3. Difference between data stored in the cloud, data on a local file system and data stored in \"on-premise\" data centers\n", + " 4. Why you might use cloud object storage\n", + "3. [Cloud-Optimized Data](./03-cloud-optimized-data-access.ipynb)\n", + " 1. What are we optimizing for and why?\n", + " 2. Anatomy of a structured data file\n", + " 3. Thought Exercise: Garage analogy\n", + " 4. How do we optimize data for reading from cloud object storage?\n", + "4. [Cloud-Optimized ICESat-2 Demo](./04-cloud-optimized-icesat2.ipynb)\n", + " 1. Cloud-Optimized vs Cloud-Native \n", + " 1. Creating an ICESat-2 GeoParquet\n", + " 3. Plot the data with lonboard\n", + "\n", + "Or simply: Cloud -> Cloud data access -> Optimized cloud data access -> Demo with ICESat-2" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/book/tutorials/cloud-computing/01-cloud-computing.ipynb b/book/tutorials/cloud-computing/01-cloud-computing.ipynb new file mode 100644 index 0000000..43ed7d4 --- /dev/null +++ b/book/tutorials/cloud-computing/01-cloud-computing.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What is cloud computing?\n", + "\n", + "
\n", + "\n", + "**Cloud computing is compute and storage as a service.** The term \"cloud computing\" is typically used to refer to commercial cloud service providers such as Amazon Web Services (AWS), Google Cloud Platform (GCP), and Microsoft Azure (Azure). These cloud service providers all offer a wide range of computing services, only a few of which we will cover today, via a pay-as-you-go payment structure.\n", + "\n", + "```{image} ./images/AWS_OurDataCenters_Background.jpg\n", + ":width: 600px\n", + ":align: center\n", + "```\n", + "\n", + "
image src: https://aws.amazon.com/compliance/data-center/data-centers/
\n", + "\n", + ">Cloud computing is the on-demand delivery of IT resources over the Internet with pay-as-you-go pricing. Instead of buying, owning, and maintaining physical data centers and servers, you can access technology services, such as computing power, storage, and databases, on an as-needed basis from a cloud provider like Amazon Web Services (AWS). ([source](https://aws.amazon.com/what-is-cloud-computing/))\n", + "\n", + "This tutorial will focus on AWS services and terminology, but Google Cloud and Microsoft Azure offer the same services.\n", + "\n", + ":::{dropdown} 🏋️ Exercise: How many CPUs and how much memory does your laptop have? And how does that compare with CryoCloud?\n", + ":open:\n", + "If you have your laptop available, open the terminal app and use the appropriate commands to determine CPU and memory.\n", + "\n", + "
\n", + "\n", + "| Operating System (OS) | CPU command | Memory Command |\n", + "|-----------------------|-----------------------------------------------------------------------------------|----------------------------|\n", + "| MacOS | `sysctl -a \\| grep hw.ncpu` | `top -l 1 \\| grep PhysMem` |\n", + "| Linux (cryocloud) | `lscpu \\| grep \"^CPU\\(s\\):\"` | `free -h` | \n", + "| Windows | https://www.top-password.com/blog/find-number-of-cores-in-your-cpu-on-windows-10/ | |\n", + "
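If you prefer to stay inside a notebook, the short sketch below reports the same numbers with Python. It assumes the `psutil` package is available in your environment (common on JupyterHub images, but an assumption rather than a guarantee):

```python
# Report CPU count and total memory from within a running Python kernel.
# Assumes psutil is installed; `pip install psutil` if it is not.
import os
import psutil

print(f"CPU cores: {os.cpu_count()}")
print(f"Total memory: {psutil.virtual_memory().total / 1e9:.1f} GB")
```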
\n", + "\n", + "Now do the same but on hub.cryointhecloud.com.\n", + "\n", + "Tip: When logged into cryocloud, you can click the ![kernel usage icon](./images/tachometer-alt_1.png) icon on the far-right toolbar.\n", + ":::\n", + "\n", + "**What did you find?** It's possible you found that your machine has **more** CPU and/or memory than cryocloud!\n", + "\n", + ":::{dropdown} So why would we want to use the cloud and not our personal computers?\n", + " 1. Because cryocloud has all the dependencies you need.\n", + " 2. Because cryocloud is \"close\" to the data (more on this later).\n", + " 3. Because you can use larger and bigger machines in the cloud (more on this later).\n", + " 4. **Having the dependencies, data, and runtime environment in the cloud can simplify reproducible science.**\n", + ":::\n", + "\n", + ":::{admonition} Takeaways\n", + "\n", + "* The cloud allows you to access many computing and storage services over the internet. Most cloud services are offered via a \"pay as you go\" model.\n", + "* Hubs like CryoCloud provide a virtual environment which simplifies reproducible science. You should use them whenever you can!\n", + ":::" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/book/tutorials/cloud-computing/02-cloud-data-access.ipynb b/book/tutorials/cloud-computing/02-cloud-data-access.ipynb new file mode 100644 index 0000000..f5a8b3f --- /dev/null +++ b/book/tutorials/cloud-computing/02-cloud-data-access.ipynb @@ -0,0 +1,140 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "9310f818-bbfe-4cb3-8e84-f21beaca334e", + "metadata": {}, + "source": [ + "# Cloud Data Access\n", + "\n", + "
\n", + "\n", + "## NASA's migration from \"on-premise\" to cloud\n", + "\n", + "```{image} ./images/DAAC_map_new.jpg\n", + ":width: 700px\n", + ":align: center\n", + "```\n", + "
image src: https://asf.alaska.edu/about-asf-daac/
\n", + "\n", + "NASA has 12 Distributed Active Archive Centers (DAACs). Each DAAC is associated with a few sub-disciplines of Earth science, and those specialties correspond to which missions and data products those DAACs are in charge of. For example, LPDAAC is the land processes DAAC and is in charge of the Harmonized Landsat Sentinel (HLS) Product which is often used for land classification. Up until about 6 years ago (which is about when I started working with NASA), all NASA Earth Observation archives resided \"on-premise\" at the data center's physical locations in data centers they manage.\n", + "\n", + "NASA, anticipating the exponential growth in their Earth Observation data archives, started the [Earthdata Cloud](https://www.earthdata.nasa.gov/eosdis/cloud-evolution) initiative. Now, NASA DAACs are in the process of migrating their collections to cloud storage. Existing missions are growing their collections as well, but new missions such as NISAR and SWOT are or will be the most significant contributors to NASA's archival volume growth.\n", + "\n", + "\n", + "```{image} ./images/archive-growth-FY22.jpg\n", + ":width: 900px\n", + ":align: center\n", + "```\n", + "
image src: https://www.earthdata.nasa.gov/esds/esds-highlights/2022-esds-highlights
\n", + "\n", + "Now, high priority and new datasets are being stored on **cloud object storage**.\n", + "\n", + "
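You can also see this migration programmatically. The sketch below uses the `earthaccess` library (which appears again later in this tutorial) to count cloud-hosted ICESat-2 collections; the metadata fields printed are an assumption about the CMR response layout, so treat it as illustrative:

```python
# Count ICESat-2 collections that are hosted on Earthdata Cloud.
# Collection search does not require Earthdata Login credentials.
import earthaccess

collections = earthaccess.search_datasets(keyword="ICESat-2", cloud_hosted=True)
print(f"Cloud-hosted ICESat-2 collections: {len(collections)}")
for collection in collections[:5]:
    # Each result wraps a CMR UMM record; ShortName identifies the dataset.
    print(collection["umm"]["ShortName"])
```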
\n", + "\n", + "## What is cloud object storage?\n", + "\n", + "Cloud object storage stores and manages unstructured data in a flat structure (as opposed to a hierarchy as with file storage). Object storage is distinguished from a database, which requires software (a database management system) to store data and often has connection limits. Object storage is distinct from local file storage, because you access cloud object storage over a network connection, whereas local file storage is accessed by the central processing unit (CPU) of whatever server you are using.\n", + "\n", + "Cloud object storage is accessible using HTTP or a cloud-object storage protocol, such as AWS' Simple Storage Service (S3). Access over the network is critical because it means many servers can access data in parallel and these storage systems are designed to be infinitely scalable and always available.\n", + "\n", + "```{image} ./images/cloud-and-local.png\n", + ":width: 500px\n", + ":align: center\n", + "```\n", + "\n", + ":::{dropdown} 🏋️ Exercise: Datasets on Earthdata Cloud\n", + ":open:\n", + "\n", + "Navigate [https://search.earthdata.nasa.gov](https://search.earthdata.nasa.gov), search for ICESat-2 and answer the following questions:\n", + "\n", + "* Which DAAC hosts ICESat-2 datasets?\n", + "* How many ICESat-2 datasets are hosted on the AWS Cloud and how can you tell?\n", + ":::\n", + "\n", + "\n", + "## There are different access patterns, it can be confusing! 🤯\n", + "\n", + "Here are a likely few:\n", + "1. Download data from a DAAC to your local machine.\n", + "2. Download data from cloud storage to your local machine.\n", + "3. Login to a virtual machine in the cloud and download data from a DAAC (when would you do this?).\n", + "4. Login to a virtual machine in the cloud, like CryoCloud, and access data directly.\n", + "\n", + "```{image} ./images/different-modes-of-access.png\n", + ":width: 1000px\n", + ":align: center\n", + "```\n", + "\n", + ":::{dropdown} Which should you chose and why?\n", + " You should use option 4 - direct access. Because S3 is a cloud service, egress (files being download outside of AWS services) is not free.\n", + " **You can only directly access (both partial reading and download) files on S3 if you are in the same AWS region as the data. This is so NASA can avoid egress fees 💸 but it also benefits you because this style of access is much faster.**\n", + " The good news is that cryointhecloud is located in AWS us-west-2, the same region as NASA's Earthdata Cloud datasets!\n", + "\n", + " Of course, you may still need to access datasets from on-prem servers as well.\n", + "\n", + "
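For illustration, here is a minimal sketch of option 4 (direct access) using `earthaccess`, the same library used in the GeoParquet notebook later in this tutorial. It assumes you are running in AWS us-west-2 (for example on CryoCloud) and that your Earthdata Login credentials are configured:

```python
# Open an ATL08 granule directly from S3 without downloading it to local disk.
# Assumes the code runs in us-west-2 and Earthdata Login credentials are available.
import earthaccess
import h5py

earthaccess.login()
granules = earthaccess.search_data(short_name="ATL08", cloud_hosted=True, count=1)
files = earthaccess.open(granules)          # file-like objects backed by S3
with h5py.File(files[0], mode="r") as f:    # only the bytes needed are fetched
    print(list(f.keys()))
```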
\n", + " \n", + ":::\n", + "\n", + "## Cloud vs Local Storage\n", + "\n", + ":::{list-table}\n", + ":header-rows: 1\n", + "\n", + "* - Feature\n", + " - Local\n", + " - Cloud\n", + "* - Scalability\n", + " - ❌ limited by physical hardware\n", + " - ✅ highly scalable\n", + "* - Accessibility\n", + " - ❌ access is limited to local network or complex setup for remote access\n", + " - ✅ accessible from anywhere with an internet connection\n", + "* - Collaboration\n", + " - ❌ sharing is hard\n", + " - ✅ sharing is possible with tools for access control\n", + "* - Data backup\n", + " - ❌ risk of data loss due to hardware failure or human error\n", + " - ✅ typically includes redundancy ([read more](https://docs.aws.amazon.com/AmazonS3/latest/userguide/DataDurability.html))\n", + "* - Performance\n", + " - ✅ faster since it does not depend on any network\n", + " - ❌ performance depends on internet speed or proximity to the data\n", + ":::\n", + "\n", + "\n", + ":::{admonition} Takeaways\n", + "\n", + "1. NASA datasets are still managed by DAACs, even though many datasets are moving to the cloud.\n", + "2. Users are encouraged to access the data directly in the cloud through AWS services (like cryocloud!)\n", + ":::" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/book/tutorials/cloud-computing/03-cloud-optimized-data-access.ipynb b/book/tutorials/cloud-computing/03-cloud-optimized-data-access.ipynb new file mode 100644 index 0000000..f973546 --- /dev/null +++ b/book/tutorials/cloud-computing/03-cloud-optimized-data-access.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cloud-Optimized Data Access\n", + "\n", + "
\n", + "\n", + "Recall from the [Cloud Data Access Notebook](./02-cloud-data-access.ipynb) that cloud object storage is accessed over the network. Local file storage access will always be faster but there are limitations. This is why the design of file formats in the cloud requires more consideration than local file storage.\n", + "\n", + "## 🏋️ Exercise\n", + "\n", + ":::{dropdown} What are some limitations of local file storage?\n", + "See the table **Cloud vs Local Storage** in the [Cloud Data Access Notebook](./02-cloud-data-access.ipynb).\n", + ":::\n", + "\n", + "## Why you should care\n", + "\n", + "Reading ICESat-2 files, which are often large and not cloud-optimized, can be slow! It is nice to know why and possibly advocate for things to be better!\n", + "\n", + "## What are we optimizing for and why?\n", + "\n", + "The \"optimize\" in cloud-optimized is to **minimize data latency** and **maximize throughput** by:\n", + "\n", + "* Making as few requests as possible;\n", + "* Making even less for metadata, preferably only one; and\n", + "* Using a file layout that simplifies accessing data for parallel reads.\n", + "\n", + ":::{attention} A future without file formats\n", + "I like to imagine a day when we won't have to think about file formats. The geospatial software community is working on ways to make all collections appear as logical datasets, so you can query them without having to think about files.\n", + ":::\n", + "\n", + "## Anatomy of a structured data file\n", + "\n", + "```{image} ./images/hdf5-structure-1.jpg\n", + ":width: 450px\n", + ":align: left\n", + "```\n", + "\n", + "
img source: https://www.neonscience.org/resources/learning-hub/tutorials/about-hdf5
\n", + "\n", + "```{image} ./images/hdf5-structure-2.png\n", + ":width: 450px\n", + ":align: left\n", + "```\n", + "\n", + "A structured data file is composed of two parts: **metadata** and the **raw data**. Metadata is information about the data, such as the data shape, data type, the data variables, the data's coordinate system, and how the data is stored, such as chunk shape and compression. Data is the actual data that you want to analyze.\n", + "\n", + "We can optimize this structure for reading from cloud storage.\n", + "\n", + "## How do we accomplish cloud-optimization?\n", + "\n", + "### An analogy - Moving away from home\n", + "\n", + "Imagine when you lived at home with your parents. Everything was right there when you needed it (like local file storage). Let's say you're about to move away to college (the cloud), but you have decided to backpack there and so you can't bring any of your belongings with you. You put everything in your parent's (infinitely large) garage (cloud object storage). Given you would need to have things shipped to you, would it be better to leave everything unpacked? To put everything all in one box? A few different boxes? And what would be the most efficient way for your parents to know where things were when you asked for them?\n", + "\n", + "```{image} ./images/dalle-college.png\n", + ":width: 400px\n", + ":align: center\n", + "```\n", + "
image generated with ChatGPT 4
\n", + "\n", + "You can actually make any common geospatial data formats (HDF5/NetCDF, GeoTIFF, LAS (LIDAR Aerial Survey)) \"cloud-optimized\" by:\n", + "\n", + "1. Separate metadata from data and store it contiguously so it can be read with one request.\n", + "2. Store data in chunks, so the whole file doesn't have to be read to access a portion of the data, and it can be compressed.\n", + "3. Make sure the chunks of data are not too small, so more data is fetched with each request.\n", + "4. Make sure the chunks are not too large, which means more data has to be transferred and decompression takes longer.\n", + "5. Compress these chunks so there is less data to transfer over the network.\n", + "\n", + ":::{note} Lazy loading\n", + "\n", + "**Separating metadata from data supports lazy loading, which is key to working quickly when data is in the cloud.** Libraries, such as xarray, first read the metadata. They defer actually reading data until it's needed for analysis. When a computation of the data is called, libraries use [HTTP range requests](https://http.dev/range-request) to request only the chunks required. This is also called \"lazy loading\" data. See also [xarray's documentation on lazy indexing](https://docs.xarray.dev/en/latest/internals/internal-design.html#lazy-indexing).\n", + "\n", + ":::\n", + "\n", + "\n", + ":::{attention} Opening Arguments\n", + "A few arguments used to open the dataset also make a huge difference, namely with how libraries, such as s3fs and h5py, cache chunks.\n", + "\n", + "For s3fs, use [`cache_type` and `block_size`](https://s3fs.readthedocs.io/en/latest/api.html?highlight=cache_type#s3fs.core.S3File).\n", + "\n", + "For h5py, use [`rdcc_nbytes` and `page_buf_size`](https://docs.h5py.org/en/stable/high/file.html#h5py.File).\n", + ":::\n", + "\n", + ":::{seealso}\n", + "1. [Cloud-Optimized HDF5 Files – Aleksandar Jelenak, The HDF Group](https://www.youtube.com/watch?v=bDH59YTXpkc)\n", + "2. [HDF at the speed of Zarr - Luis Lopez, Pangeo Showcase](https://docs.google.com/presentation/d/1iYFvGt9Zz0iaTj0STIMbboRKcBGhpOH_LuLBLqsJAlk/edit?usp=sharing) is a presentation all about Cloud-Optimizing ICESat-2 Products\n", + "3. 
A notebook demonstrating how to repack ATL03 product to cloud-optimized (for a subset of datasets): [rechunking_atl03.ipynb](https://gist.github.com/abarciauskas-bgse/8bf4388f8f8989582c807b2451c5cf8c)\n", + ":::\n", + "\n", + ":::{admonition} Takeaways\n", + "\n", + "* Understanding file formats may help in diagnosing issues when things are slow.\n", + "* You can make files cloud-optimized by separating metadata and storing it contiguously so it can all be read in one request.\n", + "* You can use arguments to libraries like s3fs and h5py to support caching.\n", + ":::" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/book/tutorials/cloud-computing/04-cloud-optimized-icesat2.ipynb b/book/tutorials/cloud-computing/04-cloud-optimized-icesat2.ipynb new file mode 100644 index 0000000..995c139 --- /dev/null +++ b/book/tutorials/cloud-computing/04-cloud-optimized-icesat2.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Cloud-Optimized ICESat-2\n", + "\n", + "## Cloud-Optimized vs Cloud-Native\n", + "\n", + "Recall from [03-cloud-optimized-data-access.ipynb](./03-cloud-optimized-data-access.ipynb) that we can make any HDF5 file cloud-optimized by restructuring the file so that all the metadata is in one place and chunks are \"not too big\" and \"not too small\". However, as users of the data, not archivers, we don't control how the file is generated and distributed, so if we're restructuring the data we might want to go with something even better - a **\"cloud-native\"** format.\n", + "\n", + ":::{important} Cloud-Native Formats\n", + "Cloud-native formats are formats that were designed specifically to be used in a cloud environment. This usually means that metadata and indexes for data is separated from the data itself in a way that allows for logical dataset access across multiple files. 
In other words, it is fast to open a large dataset and access just the parts of it that you need.\n", + ":::\n", + "\n", + ":::{warning}\n", + "Generating cloud-native formats is non-trivial.\n", + ":::\n", + "\n", + ":::{seealso}\n", + "* https://eo-college.org/topics/cloud-native-data-formats/\n", + "* https://guide.cloudnativegeo.org\n", + ":::\n", + "\n", + "## Geoparquet\n", + "\n", + "To demonstrate one such cloud-native format, geoparquet, we have generated a geoparquet store (see [atl08_parquet.ipynb](./atl08_parquet_files/atl08_parquet.ipynb)) for the ATL08 dataset and will visualize it using a very performant geospatial vector visualization library, [`lonboard`](https://developmentseed.org/lonboard/latest/).\n", + "\n", + ":::{seealso} Resource on Geoparquet\n", + "* https://guide.cloudnativegeo.org/geoparquet/\n", + "* https://geoparquet.org/\n", + ":::\n", + "\n", + "## Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "metadata": { + "mystnb": { + "skip-execution": true + } + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import pyarrow.parquet as pq\n", + "from pyarrow import fs\n", + "import pyarrow.dataset as ds\n", + "from shapely import wkb\n", + "\n", + "s3 = fs.S3FileSystem(region=\"us-west-2\", anonymous=True)\n", + "dataset = pq.ParquetDataset(\"eodc-public/atl08_parquet/\", filesystem=s3,\n", + " partitioning=\"hive\", filters=[('year', '>=', 2021), ('year', '<=', 2021), ('month', '>=', 11), ('month', '<=', 11)])\n", + "table = dataset.read(columns=[\"h_canopy\", \"geometry\"])\n", + "df = table.to_pandas()\n", + "df['geometry'] = df['geometry'].apply(wkb.loads)\n", + "\n", + "\n", + "gdf = gpd.GeoDataFrame(df, geometry='geometry')\n", + "null_value = gdf['h_canopy'].max() # can we change this to a no data value?\n", + "gdf_filtered = gdf.loc[gdf['h_canopy'] != null_value]\n", + "gdf_filtered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "gdf_filtered['h_canopy'].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "from lonboard import Map, ScatterplotLayer\n", + "from lonboard.colormap import apply_continuous_cmap\n", + "from palettable.colorbrewer.diverging import BrBG_10\n", + "\n", + "min_bound = 0\n", + "max_bound = 60\n", + "h_canopy = gdf_filtered['h_canopy']\n", + "h_canopy_normalized = (h_canopy - min_bound) / (max_bound - min_bound)\n", + "\n", + "# From https://developmentseed.org/lonboard/latest/api/layers/scatterplot-layer/#lonboard.ScatterplotLayer.radius_min_pixels:\n", + "# radius_min_pixels is \"the minimum radius in pixels. 
This can be used to prevent the circle from getting too small when zoomed out.\"\n", + "layer = ScatterplotLayer.from_geopandas(gdf_filtered, radius_min_pixels=0.5)\n", + "layer.get_fill_color = apply_continuous_cmap(h_canopy_normalized, BrBG_10, alpha=0.7)\n", + "\n", + "m = Map(layer)\n", + "m.set_view_state(zoom=2)\n", + "m" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "metadata": { + "mystnb": { + "nb_execution_mode": "off" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/book/tutorials/cloud-computing/README.md b/book/tutorials/cloud-computing/README.md new file mode 100644 index 0000000..9aa3a6b --- /dev/null +++ b/book/tutorials/cloud-computing/README.md @@ -0,0 +1,5 @@ +# Cloud Computing Tutorial + +An introduction to cloud computing for ICESat-2 Hackweek 2024. + +Start here [00-goals-and-outline.ipynb](./00-goals-and-outline.ipynb) diff --git a/book/tutorials/cloud-computing/atl08_parquet_files/atl08-parquet-metadata.json b/book/tutorials/cloud-computing/atl08_parquet_files/atl08-parquet-metadata.json new file mode 100644 index 0000000..0208d5d --- /dev/null +++ b/book/tutorials/cloud-computing/atl08_parquet_files/atl08-parquet-metadata.json @@ -0,0 +1,132 @@ +{ + "columns": { + "geometry": { + "bbox": [], + "covering": { + "bbox": { + "xmax": [ + "bbox", + "xmax" + ], + "xmin": [ + "bbox", + "xmin" + ], + "ymax": [ + "bbox", + "ymax" + ], + "ymin": [ + "bbox", + "ymin" + ] + } + }, + "crs": { + "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json", + "area": "World.", + "bbox": { + "east_longitude": 180, + "north_latitude": 90, + "south_latitude": -90, + "west_longitude": -180 + }, + "coordinate_system": { + "axis": [ + { + "abbreviation": "Lon", + "direction": "east", + "name": "Geodetic longitude", + "unit": "degree" + }, + { + "abbreviation": "Lat", + "direction": "north", + "name": "Geodetic latitude", + "unit": "degree" + } + ], + "subtype": "ellipsoidal" + }, + "datum_ensemble": { + "accuracy": "2.0", + "ellipsoid": { + "inverse_flattening": 298.257223563, + "name": "WGS 84", + "semi_major_axis": 6378137 + }, + "id": { + "authority": "EPSG", + "code": 6326 + }, + "members": [ + { + "id": { + "authority": "EPSG", + "code": 1166 + }, + "name": "World Geodetic System 1984 (Transit)" + }, + { + "id": { + "authority": "EPSG", + "code": 1152 + }, + "name": "World Geodetic System 1984 (G730)" + }, + { + "id": { + "authority": "EPSG", + "code": 1153 + }, + "name": "World Geodetic System 1984 (G873)" + }, + { + "id": { + "authority": "EPSG", + "code": 1154 + }, + "name": "World Geodetic System 1984 (G1150)" + }, + { + "id": { + "authority": "EPSG", + "code": 1155 + }, + "name": "World Geodetic System 1984 (G1674)" + }, + { + "id": { + "authority": "EPSG", + "code": 1156 + }, + "name": "World Geodetic System 1984 (G1762)" + }, + { + "id": { + "authority": "EPSG", + "code": 1309 + }, + "name": "World Geodetic System 1984 (G2139)" + } + ], + "name": "World Geodetic System 1984 ensemble" + }, + "id": { + "authority": "OGC", + "code": "CRS84" + }, + "name": "WGS 84 (CRS84)", + "scope": "Not known.", + "type": "GeographicCRS" + }, + "edges": "planar", + "encoding": "WKB", + "geometry_types": [ + "Point" + ] + } + }, 
+ "primary_column": "geometry", + "version": "1.1.0-dev" +} \ No newline at end of file diff --git a/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet.ipynb b/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet.ipynb new file mode 100644 index 0000000..543acba --- /dev/null +++ b/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "017000a7-859d-4aad-a89f-754a3973c71b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Generating an ATL08 GeoParquet Store\n", + "\n", + "
\n", + "\n", + "This notebook creates a [GeoParquet](https://geoparquet.org/) store from scratch using a subset of [ICESat-2 ATL08](https://nsidc.org/data/atl08/versions/6) files. GeoParquet is built on [Apache Parquet](https://parquet.apache.org/) which is an open-source column-oriented file format which allows for efficient storage and retrieval using high performance compression.\n", + "\n", + "The conversion functions are in the helpers file atl08_parquet_helpers which are functions originally written by Sean Harkins of Development Seed in https://github.com/developmentseed/icesat-parquet/.\n", + "\n", + ":::{warning}\n", + "This work is experimental\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "id": "60b32c78-7a35-4ad6-892d-aa459aff2550", + "metadata": {}, + "source": [ + "## 1. Install and import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cddb38b2-7f9f-4be7-8ae4-cb79f01d8607", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pyarrow geoarrow-pyarrow geopandas earthaccess==0.9.0 jupyterlab_vim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4527f0f3-6233-4bee-98b6-ff9c89ec0d26", + "metadata": {}, + "outputs": [], + "source": [ + "import atl08_parquet_helpers as aph\n", + "from datetime import datetime, timezone, timedelta\n", + "import earthaccess\n", + "import fsspec\n", + "import geopandas as gpd\n", + "from lonboard import viz\n", + "import os\n", + "import pyarrow.parquet as pq\n", + "from shapely import wkb" + ] + }, + { + "cell_type": "markdown", + "id": "ed0e098c-f3c3-4a7b-83b3-d59148310ae3", + "metadata": {}, + "source": [ + "## 2. Login to earthaccess using [URS credentials](https://urs.earthdata.nasa.gov/home) and then setup an S3 client with credentials for NSIDC DAAC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f21f6342-6864-41ef-820c-f06e5116a8a4", + "metadata": {}, + "outputs": [], + "source": [ + "earthaccess.login()\n", + "\n", + "aws_creds = earthaccess.get_s3_credentials(daac='NSIDC')\n", + "\n", + "s3 = fsspec.filesystem(\n", + " 's3',\n", + " anon=False,\n", + " key=aws_creds['accessKeyId'],\n", + " secret=aws_creds['secretAccessKey'],\n", + " token=aws_creds['sessionToken'],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39224451-e684-4bad-9b1d-3dc9eafe01f4", + "metadata": {}, + "source": [ + "## 3. Search for a subset of ATL08 granules using the [earthaccess](https://github.com/nsidc/earthaccess) library\n", + "\n", + "This search is only for 1 week for results over South America." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c55b6316-eb17-4ab0-acfb-f3563e5bc316", + "metadata": {}, + "outputs": [], + "source": [ + "start = datetime(2021, 11, 1, tzinfo=timezone.utc)\n", + "end = start + timedelta(days=7)\n", + "\n", + "results = earthaccess.search_data(\n", + " short_name=\"ATL08\",\n", + " cloud_hosted=True,\n", + " temporal=(start, end),\n", + " bounding_box=(-90,-56,-32,14),\n", + " count=-1\n", + ")\n", + "year_month = f\"year={start.year}/month={start.month}\"\n", + "week = 0\n", + "len(results)" + ] + }, + { + "cell_type": "markdown", + "id": "7f82d0e1-c0d0-4165-ba97-63c8a890ef94", + "metadata": {}, + "source": [ + "## 4. 
Sort the results and setup the parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d1e3632-7af4-4c3e-aa68-4dd11fc8d06e", + "metadata": {}, + "outputs": [], + "source": [ + "sorted_results = sorted(results, key=lambda r : datetime.strptime(r['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime'], '%Y-%m-%dT%H:%M:%S.%fZ'))\n", + "\n", + "template_file = s3.open(sorted_results[0].data_links(access=\"direct\")[0], 'rb')\n", + "\n", + "atl08_parquet = aph.ParquetTable(\n", + " geometadata_file='atl08-parquet-metadata.json',\n", + " template_file=template_file\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "18f17bed-fb90-4386-9b47-05ea56b920a6", + "metadata": {}, + "source": [ + "## 5. Write results to the parquet table\n", + "\n", + "Write results to 1 parquet file, using the year-month as a partition. Later on if we add more weeks we can add them to new parquet files and new partitions as appropriate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42bc69f4-69d5-4682-be3d-02fdfeebea02", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# fetch a group and write them to a partition\n", + "directory=\"atl08_parquet\"\n", + "os.makedirs(f\"{directory}/{year_month}\", exist_ok=True)\n", + "# i think it can only go one beam at a time even with more workers because of the global hdf5 interpreter lock\n", + "atl08_parquet.write_results_by_partition(sorted_results, s3, parquet_file=f\"{directory}/{year_month}/{week}.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "6c41ee4f-5885-475b-930d-ad523f03bb7c", + "metadata": {}, + "source": [ + "## We're done creating the parquet!\n", + "\n", + "Now we can checkout the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "852324eb-e248-475f-a764-149c324891f6", + "metadata": {}, + "outputs": [], + "source": [ + "# The hive partitioning scheme assumes directory names with key=value pairs like \"/year=2009/month=11\"\n", + "# Partitioning speeds up queries as the query engine only needs to look at certain paths which match the key/value pairs used in creating the partitions.\n", + "dataset = pq.ParquetDataset(\"atl08_parquet\", partitioning=\"hive\", filters=[('year', '>=', 2021),\n", + " ('year', '<=', 2021),\n", + " ('month', '>=', 11),\n", + " ('month', '<=', 11)])\n", + "table = dataset.read(columns=[\"h_canopy\", \"geometry\"])\n", + "df = table.to_pandas()\n", + "df['geometry'] = df['geometry'].apply(wkb.loads)\n", + "\n", + "\n", + "gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')\n", + "gdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db8a1c6f-5317-46dd-aac6-cdcafb1a2ba8", + "metadata": {}, + "outputs": [], + "source": [ + "null_value = gdf['h_canopy'].max() \n", + "gdf_filtered = gdf.loc[gdf['h_canopy'] != null_value]\n", + "gdf_filtered" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10f0499b-5cf2-4b68-81b4-3d81a0eee73f", + "metadata": {}, + "outputs": [], + "source": [ + "gdf_filtered['h_canopy'].min(), gdf_filtered['h_canopy'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bebbd2e2-a5da-4473-a82d-3a67e52f8563", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# this will take too long and / or cause the kernel to die with large dataframes\n", + "# depending on your available memory\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "import cartopy.crs as ccrs\n", + "\n", + "crs = 
ccrs.PlateCarree()\n", + "fig, ax = plt.subplots(subplot_kw=dict(projection=crs))\n", + "gdf_filtered.plot(column='h_canopy', ax=ax, legend=True, cmap='viridis')\n", + "ax.set_extent([-116,-23,-32,21])\n", + "ax.set_title('h_canopy plot')\n", + "\n", + "# Add coastlines and gridlines\n", + "ax.coastlines()\n", + "ax.gridlines(draw_labels=True)\n", + "\n", + "# Show plot\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8472a90-04de-44cc-a7ec-1152a68d3e1f", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "from lonboard import Map, ScatterplotLayer\n", + "from lonboard.colormap import apply_continuous_cmap\n", + "from palettable.colorbrewer.diverging import BrBG_10\n", + "\n", + "layer = ScatterplotLayer.from_geopandas(gdf_filtered)\n", + "h_canopy = gdf_filtered['h_canopy']\n", + "layer.get_fill_color = apply_continuous_cmap(h_canopy, BrBG_10, alpha=0.7)\n", + "\n", + "m = Map(layer)\n", + "m" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + }, + "metadata": { + "mystnb": { + "nb_execution_mode": "off" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet_helpers.py b/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet_helpers.py new file mode 100644 index 0000000..2fba361 --- /dev/null +++ b/book/tutorials/cloud-computing/atl08_parquet_files/atl08_parquet_helpers.py @@ -0,0 +1,198 @@ +from dataclasses import dataclass +from datetime import datetime, timezone, timedelta +import h5py +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import fsspec +import numpy as np +import shapely +from earthaccess.results import DataGranule +from earthaccess.search import DataGranules +import json +import os +import time +import concurrent + +@dataclass +class ParquetTable: + """Class for interacting with a parquet table""" + schema: pa.schema + geometadata: dict + + def __init__(self, geometadata_file: str, template_file: str): + with open(geometadata_file, 'r') as f: + self.geometadata = json.loads(f.read()) + file = h5py.File(template_file, rdcc_bytes=4*1024*1024) + self.schema = self.create_pyarrow_schema(file, self.geometadata) + + def create_pyarrow_schema(self, template_file: h5py.File, geometadata) -> pa.schema: + # TODO: make this configurable + land_segments_group = template_file["gt1l"]["land_segments"] + canopy_group = template_file["gt1l"]["land_segments"]["canopy"] + terrain_group = template_file["gt1l"]["land_segments"]["terrain"] + + land_segment_fields = self.datasets_to_fields(land_segments_group) + geometry_field = pa.field("geometry", pa.binary(), metadata={ + "encoding": "WKB", + "geometry_types": "POINT" + }) + land_segment_fields.append(geometry_field) + + timestamp_field = pa.field("timestamp", pa.timestamp('ns')) + land_segment_fields.append(timestamp_field) + + beam_field = pa.field("beam", pa.string()) + land_segment_fields.append(beam_field) + + strength_field = pa.field("strength", pa.string()) + land_segment_fields.append(strength_field) + + + canopy_fields = self.datasets_to_fields(canopy_group) + terrain_fields = self.datasets_to_fields(terrain_group) + + fields = 
land_segment_fields + canopy_fields + terrain_fields + + metadata = json.dumps(geometadata).encode('utf-8') + schema = pa.schema(fields, metadata={b"geo": metadata}) + + return schema + + def result_bbox(self, result: DataGranule): + points = result["umm"]["SpatialExtent"]["HorizontalSpatialDomain"]["Geometry"]["GPolygons"][0]["Boundary"]["Points"] + + longitudes = [point['Longitude'] for point in points] + latitudes = [point['Latitude'] for point in points] + + min_lon, min_lat = min(longitudes), min(latitudes) + max_lon, max_lat = max(longitudes), max(latitudes) + bbox = shapely.geometry.box(min_lon, min_lat, max_lon, max_lat) + return bbox + + def results_bounds(self, results: list[DataGranule]): + union_bbox = self.result_bbox(results[0]) + for result in results: + bbox = self.result_bbox(result) + union_bbox = union_bbox.union(bbox) + return list(union_bbox.envelope.bounds) + + def datasets_to_fields(self, group: h5py.Group): + fields = [] + for key in group.keys(): + if isinstance(group[key], h5py.Dataset): + dtype = group[key].dtype + numpy_dtype = dtype.newbyteorder("=") + arrow_type = pa.from_numpy_dtype(numpy_dtype) + fields.append((key, arrow_type)) + return fields + + def get_group_chunks(self, group: h5py.Group, offset: int, chunk_size: int) -> list[np.array]: + chunks = [] + for key in group.keys(): + if isinstance(group[key], h5py.Dataset): + if len(group[key].chunks) == 1: + chunks.append(group[key][offset:offset+chunk_size]) + # Handle variables with land segment chunking + elif len(group[key].chunks) == 2: + chunks.append(group[key][offset:offset+chunk_size, 0]) + return chunks + + def chunks_to_tables(self, result: DataGranule, fs: fsspec.filesystem, beam: str, group_schema: pa.schema): + tables = [] + url = result.data_links(access="direct")[0] + print(url) + with fs.open(url, 'rb') as f: + file = h5py.File(f, rdcc_nbytes=4*1024*1024) + orientation = file['orbit_info']['sc_orient'][0] + if orientation == 0 and beam[-1] == "l": + strength = "strong" + elif orientation == 1 and beam[-1] == "r": + strength = "strong" + elif orientation == 2: + strength = "degraded" + else: + strength = "weak" + + GPS_EPOCH = pd.to_datetime('1980-01-06 00:00:00') + # Not sure why other examples of this were using the value as an array + atlas_sdp_gps_epoch = file['ancillary_data']['atlas_sdp_gps_epoch'][0] + + land_segments_group = file[beam]["land_segments"] + canopy_group = file[beam]["land_segments"]["canopy"] + terrain_group = file[beam]["land_segments"]["terrain"] + + chunk_size = land_segments_group["latitude"].chunks[0] + size = land_segments_group["latitude"].size + number_of_chunks = (size // chunk_size) + 1 + + for n in range(number_of_chunks): + offset = n * chunk_size + land_segment_chunks = self.get_group_chunks(land_segments_group, offset, chunk_size) + # Populate geometry field + geometries = [] + for lat, lon in zip( + land_segments_group["latitude"][offset:offset+chunk_size], + land_segments_group["longitude"][offset:offset+chunk_size] + ): + point = shapely.Point(lon, lat) + point_wkb = shapely.to_wkb(point, flavor="iso") + geometries.append(point_wkb) + land_segment_chunks.append(geometries) + + # Important to note that array order append order needs to be the same as the schema fields order. 
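+ # delta_time is seconds since the ATLAS SDP epoch; atlas_sdp_gps_epoch is that epoch's
+ # offset from the GPS epoch (1980-01-06), so their sum added to GPS_EPOCH gives an
+ # absolute timestamp for each land segment.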
+ timestamps = [] + for delta_time in land_segments_group["delta_time"][offset:offset+chunk_size]: + timestamp = GPS_EPOCH + pd.to_timedelta(delta_time+atlas_sdp_gps_epoch, unit='s') + timestamps.append(timestamp) + land_segment_chunks.append(timestamps) + + # Add fixed values + beam_values = [beam] * len(geometries) + land_segment_chunks.append(beam_values) + strength_values = [strength] * len(geometries) + land_segment_chunks.append(strength_values) + + canopy_chunks = self.get_group_chunks(canopy_group, offset, chunk_size) + terrain_chunks = self.get_group_chunks(terrain_group, offset, chunk_size) + chunks = land_segment_chunks + canopy_chunks + terrain_chunks + table = pa.Table.from_arrays(chunks, schema=group_schema) + tables.append(table) + return tables + + def update_schema_geo_metadata(self, results: list[DataGranule]) -> pa.schema: + """ + Update schema metadata with bounds from the list of granules. + """ + bounds = self.results_bounds(results) + + # Create a copy of geometadata and update the bounds + updated_geometadata = self.geometadata.copy() + updated_geometadata["columns"]["geometry"]["bbox"] = bounds + + # Update the schema's metadata + updated_metadata = self.schema.metadata.copy() + updated_metadata[b"geo"] = json.dumps(updated_geometadata).encode('utf-8') + + # Return the schema with updated metadata + return self.schema.with_metadata(updated_metadata) + + def write_results_by_partition(self, results_list: list[DataGranule], fs: fsspec.filesystem, parquet_file): # parquet_file is path or file-like + results_schema = self.update_schema_geo_metadata(results_list) + table_writer = pq.ParquetWriter(parquet_file, results_schema) + beams = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"] + for beam in beams: + results_tables = [] + with concurrent.futures.ThreadPoolExecutor() as executor: + # does it make sense to use the results schema for each table? 
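+ # Submit one chunks_to_tables task per granule for this beam and gather the
+ # resulting partial tables; they are concatenated and written out below.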
+ futures = [executor.submit(self.chunks_to_tables, result, fs, beam, results_schema) for result in results_list] + completed_futures, _ = concurrent.futures.wait(futures) + for future in completed_futures: + try: + results_tables.extend(future.result()) + except Exception as exception: + print(exception) + + combined_table = pa.concat_tables(results_tables) + table_writer.write_table(combined_table) + table_writer.close() diff --git a/book/tutorials/cloud-computing/images/AWS_OurDataCenters_Background.jpg b/book/tutorials/cloud-computing/images/AWS_OurDataCenters_Background.jpg new file mode 100644 index 0000000..389811b Binary files /dev/null and b/book/tutorials/cloud-computing/images/AWS_OurDataCenters_Background.jpg differ diff --git a/book/tutorials/cloud-computing/images/DAAC_map_new.jpg b/book/tutorials/cloud-computing/images/DAAC_map_new.jpg new file mode 100644 index 0000000..38f34c1 Binary files /dev/null and b/book/tutorials/cloud-computing/images/DAAC_map_new.jpg differ diff --git a/book/tutorials/cloud-computing/images/EarthDataCloud-Logo.jpg b/book/tutorials/cloud-computing/images/EarthDataCloud-Logo.jpg new file mode 100644 index 0000000..524bcd3 Binary files /dev/null and b/book/tutorials/cloud-computing/images/EarthDataCloud-Logo.jpg differ diff --git a/book/tutorials/cloud-computing/images/archive-growth-FY22.jpg b/book/tutorials/cloud-computing/images/archive-growth-FY22.jpg new file mode 100644 index 0000000..e88e50b Binary files /dev/null and b/book/tutorials/cloud-computing/images/archive-growth-FY22.jpg differ diff --git a/book/tutorials/cloud-computing/images/cloud-and-local.png b/book/tutorials/cloud-computing/images/cloud-and-local.png new file mode 100644 index 0000000..f0ea759 Binary files /dev/null and b/book/tutorials/cloud-computing/images/cloud-and-local.png differ diff --git a/book/tutorials/cloud-computing/images/cloud.gif b/book/tutorials/cloud-computing/images/cloud.gif new file mode 100644 index 0000000..1d908f3 Binary files /dev/null and b/book/tutorials/cloud-computing/images/cloud.gif differ diff --git a/book/tutorials/cloud-computing/images/dalle-college.png b/book/tutorials/cloud-computing/images/dalle-college.png new file mode 100644 index 0000000..1664bff Binary files /dev/null and b/book/tutorials/cloud-computing/images/dalle-college.png differ diff --git a/book/tutorials/cloud-computing/images/different-modes-of-access.png b/book/tutorials/cloud-computing/images/different-modes-of-access.png new file mode 100644 index 0000000..910f0fe Binary files /dev/null and b/book/tutorials/cloud-computing/images/different-modes-of-access.png differ diff --git a/book/tutorials/cloud-computing/images/hdf5-structure-1.jpg b/book/tutorials/cloud-computing/images/hdf5-structure-1.jpg new file mode 100644 index 0000000..95b83bb Binary files /dev/null and b/book/tutorials/cloud-computing/images/hdf5-structure-1.jpg differ diff --git a/book/tutorials/cloud-computing/images/hdf5-structure-2.png b/book/tutorials/cloud-computing/images/hdf5-structure-2.png new file mode 100644 index 0000000..5aba80c Binary files /dev/null and b/book/tutorials/cloud-computing/images/hdf5-structure-2.png differ diff --git a/book/tutorials/cloud-computing/images/kernel-usage.png b/book/tutorials/cloud-computing/images/kernel-usage.png new file mode 100644 index 0000000..9570856 Binary files /dev/null and b/book/tutorials/cloud-computing/images/kernel-usage.png differ diff --git a/book/tutorials/cloud-computing/images/tachometer-alt_1.png 
b/book/tutorials/cloud-computing/images/tachometer-alt_1.png new file mode 100644 index 0000000..610f2e5 Binary files /dev/null and b/book/tutorials/cloud-computing/images/tachometer-alt_1.png differ diff --git a/book/tutorials/index.md b/book/tutorials/index.md index 576c1d1..7095558 100644 --- a/book/tutorials/index.md +++ b/book/tutorials/index.md @@ -7,3 +7,4 @@ Below you'll find a table keeping track of all tutorials presented at this event | Tutorial | Topics | Datasets | Recording Link | | - | - | - | - | | [Example Notebook](./example/tutorial-notebook.ipynb) | Jupyter Book formatting, ipyleaflet | n/a | Not recorded | +| [Cloud Computing](./cloud-computing/00-goals-and-outline.ipynb) | Cloud Computing Tutorial | n/a | Not recorded |