From 756c436ebc1fb260799a82195042e2352d37dc5e Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Fri, 18 Sep 2020 17:15:21 -0600 Subject: [PATCH 01/30] add wildcard --- docs/source/notebooks/wildcard.ipynb | 499 +++++++++++++++++++++++++++ 1 file changed, 499 insertions(+) create mode 100644 docs/source/notebooks/wildcard.ipynb diff --git a/docs/source/notebooks/wildcard.ipynb b/docs/source/notebooks/wildcard.ipynb new file mode 100644 index 00000000..41a374f8 --- /dev/null +++ b/docs/source/notebooks/wildcard.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Search Functionality\n", + "\n", + "## How to use both regular and wildcard expressions in `search()`\n", + "\n", + "It is often useful to search using a wildcard or a regular expression for your search." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case #1 - Inconsistent First Member IDs\n", + "\n", + "Some models don't have the member number **r1i1p1f1**. For example, **CNRM-CM6-1**'s first member is **r1i1p1f2**. \n", + "\n", + "A typical query that looks like:\n", + "\n", + "```python\n", + "\n", + "col.search(\n", + " experiment_id=[\"historical\"],\n", + " table_id=\"Amon\",\n", + " variable_id=\"tas\",\n", + " member_id=\"r1i1p1f1\",\n", + ")\n", + "\n", + "```\n", + "\n", + "Will return models that that strictly meet this criteria, excluding all models, such as **CNRM-CM-01**, that don't have this first member.\n", + "\n", + "If you want to include models that begin with varying member ids, you can use a wildcard (`*`) in your search. \n", + "\n", + "```python\n", + "\n", + "col.search(\n", + " experiment_id=[\"historical\"],\n", + " table_id=\"Amon\",\n", + " variable_id=\"tas\",\n", + " member_id=\"r1i1p1f*\",\n", + ")\n", + "\n", + "```\n", + "This search will return all of the target members." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Case #2 Non-CF Standard Names\n", + "\n", + "In some datasets the long names are **not** CF Standard Names, but names specified in some other documentation. For this reason the user may not know exactly what name to search for without listing all names.\n", + "\n", + "```\n", + "uniques = col.unique(columns=['long_name'])\n", + "nameList = sorted(uniques['long_name']['values'])\n", + "print(*nameList, sep='\\n') #note *list to unpack each item for print function\n", + "```\n", + "\n", + "The above code block uses the wildcard expression (`*`) to find all unique names in the collection, then alphabetically sorts and prints them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The longer example\n", + "\n", + "Import and load in a typical enhanced collection description file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import intake\n", + "import pandas as pd\n", + "import pprint\n", + "from IPython.display import HTML" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

aws-cesm1-le catalog with 27 dataset(s) from 365 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
component5
dim2
frequency5
experiment6
start10
end11
variable75
long_name75
path365
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cat_url = 'https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.json'\n", + "col = intake.open_esm_datastore(cat_url)\n", + "col" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Take a look at the first few lines of the enhanced catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is an inventory of the Community Earth System Model (CESM) Large Ensemble (LENS) dataset in Zarr format publicly available on Amazon S3 (https://doi.org/10.26024/wt24-5j82)\n", + "Catalog file: https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.csv\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentdimfrequencyexperimentstartendvariablelong_namepath
atm2DmonthlyHIST1850-011919-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-HIST-FLNS.zarr
atm2Dmonthly20C1920-012005-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-20C-FLNS.zarr
atm2Ddaily20C1920-01-012005-12-31FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS.zarr
atm2DmonthlyRCP852006-012100-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-RCP85-FLNS.zarr
atm2DdailyRCP852006-01-012100-12-31FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/daily/cesmLE-RCP85-FLNS.zarr
atm2DmonthlyCTRL0400-012200-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL-FLNS.zarr
atm2DmonthlyCTRL_AMIP0001-012600-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL_AMIP-FLNS.zarr
atm2DmonthlyCTRL_SLAB_OCN0001-011000-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL_SLAB_OCN-FLNS.zarr
atm2DmonthlyHIST1850-011919-12FLNSCClearsky net longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-HIST-FLNSC.zarr
atm2Dmonthly20C1920-012005-12FLNSCClearsky net longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-20C-FLNSC.zarr
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(col.esmcol_data['description'])\n", + "print(\"Catalog file:\", col.esmcol_data['catalog_file'])\n", + "print(col)\n", + "\n", + "HTML(col.df.head(10).to_html(index=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Display all of the `long_name` variable options" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Clearsky net longwave flux at surface', 'Clearsky net solar flux at surface', 'Convective precipitation rate (liq + ice)', 'Convective snow rate (water equivalent)', 'Dissolved Inorganic Carbon', 'Dissolved Organic Carbon', 'Dissolved Oxygen', 'Fraction of sfc area covered by sea-ice', 'Free-Surface Residual Heat Flux', 'Free-Surface Residual Salt Flux', 'Freshwater Flux', 'Geopotential Height (above sea level)', 'Geopotential Z at 500 mbar pressure surface', 'Heat Flux across top face', 'Heat Flux in grid-x direction', 'Heat Flux in grid-y direction', 'Horizontal total wind speed average at the surface', 'Internal Ocean Heat Flux Due to Ice Formation', 'Large-scale (stable) precipitation rate (liq + ice)', 'Large-scale (stable) snow rate (water equivalent)', 'Lowest model level zonal wind', 'Maximum (convective and large-scale) precipitation rate (liq+ice)', 'Maximum reference height temperature over output period', 'Meridional wind', 'Minimum reference height temperature over output period', 'Mixed-Layer Depth', 'Net longwave flux at surface', 'Net solar flux at surface', 'Net solar flux at top of atmosphere', 'Potential Density Ref to Surface', 'Potential Temperature', 'Reference height humidity', 'Reference height temperature', 'Salinity', 'Salt Flux across top face', 'Salt Flux in grid-x direction', 'Salt Flux in grid-y direction', 'Sea Surface Height', 'Sea level pressure', 'Solar Short-Wave Heat Flux', 'Solar Short-Wave Heat Flux in boundary layer', 'Solar Short-Wave Heat Flux in top layer', 'Specific Humidity at 850 mbar pressure surface', 'Specific humidity', 'Surface latent heat flux', 'Surface pressure', 'Surface sensible heat flux', 'Surface temperature (radiative)', 'Temperature', 'Total (convective and large-scale) precipitation rate (liq + ice)', 'Total (vertically integrated) precipitable water', 'Total Surface Heat Flux including short-wave', 'Upwelling longwave flux at top of model', 'Velocity in grid-x direction', 'Velocity in grid-y direction', 'Vertical Velocity', 'Virtual Salt Flux due to weak restoring', 'Virtual Salt Flux in FW Flux formulation', 'Wind stress (squared) in grid-x direction', 'Wind stress (squared) in grid-y direction', 'Wind stress in grid-x direction', 'Wind stress in grid-y direction', 'Zonal wind', 'atmospheric rain', 'atmospheric snow', 'fraction of ground covered by snow', 'grid cellmean ice thickness', 'grid cellmean ice thickness (daily)', 'grid cellmean ice thickness (monthly)', 'ice area, aggregate (daily)', 'ice area, aggregate (monthly)', 'snow depth (liquid water)', 'soil liquid water (vegetated landunits only)', 'soil liquid water + ice in top 10cm of soil (veg landunits only)', 'total liquid runoff (does not include QSNWCPICE)']\n" + ] + } + ], + "source": [ + "uniques = col.unique(columns=['long_name'])\n", + "nameList = sorted(uniques['long_name']['values'])\n", + "print(nameList, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to unpack each item for the `print` function, you need to use the wildcard `*`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clearsky net longwave flux at surface\n", + "Clearsky net solar flux at surface\n", + "Convective precipitation rate (liq + ice)\n", + "Convective snow rate (water equivalent)\n", + "Dissolved Inorganic Carbon\n", + "Dissolved Organic Carbon\n", + "Dissolved Oxygen\n", + "Fraction of sfc area covered by sea-ice\n", + "Free-Surface Residual Heat Flux\n", + "Free-Surface Residual Salt Flux\n", + "Freshwater Flux\n", + "Geopotential Height (above sea level)\n", + "Geopotential Z at 500 mbar pressure surface\n", + "Heat Flux across top face\n", + "Heat Flux in grid-x direction\n", + "Heat Flux in grid-y direction\n", + "Horizontal total wind speed average at the surface\n", + "Internal Ocean Heat Flux Due to Ice Formation\n", + "Large-scale (stable) precipitation rate (liq + ice)\n", + "Large-scale (stable) snow rate (water equivalent)\n", + "Lowest model level zonal wind\n", + "Maximum (convective and large-scale) precipitation rate (liq+ice)\n", + "Maximum reference height temperature over output period\n", + "Meridional wind\n", + "Minimum reference height temperature over output period\n", + "Mixed-Layer Depth\n", + "Net longwave flux at surface\n", + "Net solar flux at surface\n", + "Net solar flux at top of atmosphere\n", + "Potential Density Ref to Surface\n", + "Potential Temperature\n", + "Reference height humidity\n", + "Reference height temperature\n", + "Salinity\n", + "Salt Flux across top face\n", + "Salt Flux in grid-x direction\n", + "Salt Flux in grid-y direction\n", + "Sea Surface Height\n", + "Sea level pressure\n", + "Solar Short-Wave Heat Flux\n", + "Solar Short-Wave Heat Flux in boundary layer\n", + "Solar Short-Wave Heat Flux in top layer\n", + "Specific Humidity at 850 mbar pressure surface\n", + "Specific humidity\n", + "Surface latent heat flux\n", + "Surface pressure\n", + "Surface sensible heat flux\n", + "Surface temperature (radiative)\n", + "Temperature\n", + "Total (convective and large-scale) precipitation rate (liq + ice)\n", + "Total (vertically integrated) precipitable water\n", + "Total Surface Heat Flux including short-wave\n", + "Upwelling longwave flux at top of model\n", + "Velocity in grid-x direction\n", + "Velocity in grid-y direction\n", + "Vertical Velocity\n", + "Virtual Salt Flux due to weak restoring\n", + "Virtual Salt Flux in FW Flux formulation\n", + "Wind stress (squared) in grid-x direction\n", + "Wind stress (squared) in grid-y direction\n", + "Wind stress in grid-x direction\n", + "Wind stress in grid-y direction\n", + "Zonal wind\n", + "atmospheric rain\n", + "atmospheric snow\n", + "fraction of ground covered by snow\n", + "grid cellmean ice thickness\n", + "grid cellmean ice thickness (daily)\n", + "grid cellmean ice thickness (monthly)\n", + "ice area, aggregate (daily)\n", + "ice area, aggregate (monthly)\n", + "snow depth (liquid water)\n", + "soil liquid water (vegetated landunits only)\n", + "soil liquid water + ice in top 10cm of soil (veg landunits only)\n", + "total liquid runoff (does not include QSNWCPICE)\n" + ] + } + ], + "source": [ + "print(*nameList, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Note**: For the wildcard search to work, you will need at least intake-esm v2020.08.15." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "intake-esm-dev", + "language": "python", + "name": "intake-esm-dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 90fd1ce655a2d6abf486800cd7e2590dda41ee97 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Fri, 18 Sep 2020 17:25:31 -0600 Subject: [PATCH 02/30] linting --- docs/source/notebooks/wildcard.ipynb | 51 ++++++++++++++++------------ 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/docs/source/notebooks/wildcard.ipynb b/docs/source/notebooks/wildcard.ipynb index 41a374f8..0bd9c340 100644 --- a/docs/source/notebooks/wildcard.ipynb +++ b/docs/source/notebooks/wildcard.ipynb @@ -8,16 +8,18 @@ "\n", "## How to use both regular and wildcard expressions in `search()`\n", "\n", - "It is often useful to search using a wildcard or a regular expression for your search." + "It is often useful to search using a wildcard or a regular expression for your\n", + "search.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Use Case #1 - Inconsistent First Member IDs\n", + "## Use Case #1 - Inconsistent First Member IDs\n", "\n", - "Some models don't have the member number **r1i1p1f1**. For example, **CNRM-CM6-1**'s first member is **r1i1p1f2**. \n", + "Some models don't have the member number **r1i1p1f1**. For example,\n", + "**CNRM-CM6-1**'s first member is **r1i1p1f2**.\n", "\n", "A typical query that looks like:\n", "\n", @@ -32,9 +34,11 @@ "\n", "```\n", "\n", - "Will return models that that strictly meet this criteria, excluding all models, such as **CNRM-CM-01**, that don't have this first member.\n", + "Will return models that that strictly meet this criteria, excluding all models,\n", + "such as **CNRM-CM-01**, that don't have this first member.\n", "\n", - "If you want to include models that begin with varying member ids, you can use a wildcard (`*`) in your search. \n", + "If you want to include models that begin with varying member ids, you can use a\n", + "wildcard (`*`) in your search.\n", "\n", "```python\n", "\n", @@ -46,7 +50,8 @@ ")\n", "\n", "```\n", - "This search will return all of the target members." + "\n", + "This search will return all of the target members.\n" ] }, { @@ -55,7 +60,9 @@ "source": [ "## Use Case #2 Non-CF Standard Names\n", "\n", - "In some datasets the long names are **not** CF Standard Names, but names specified in some other documentation. For this reason the user may not know exactly what name to search for without listing all names.\n", + "In some datasets the long names are **not** CF Standard Names, but names\n", + "specified in some other documentation. For this reason the user may not know\n", + "exactly what name to search for without listing all names.\n", "\n", "```\n", "uniques = col.unique(columns=['long_name'])\n", @@ -63,7 +70,8 @@ "print(*nameList, sep='\\n') #note *list to unpack each item for print function\n", "```\n", "\n", - "The above code block uses the wildcard expression (`*`) to find all unique names in the collection, then alphabetically sorts and prints them." + "The above code block uses the wildcard expression (`*`) to find all unique names\n", + "in the collection, then alphabetically sorts and prints them.\n" ] }, { @@ -72,7 +80,7 @@ "source": [ "### The longer example\n", "\n", - "Import and load in a typical enhanced collection description file" + "Import and load in a typical enhanced collection description file\n" ] }, { @@ -166,7 +174,7 @@ } ], "source": [ - "cat_url = 'https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.json'\n", + "cat_url = \"https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.json\"\n", "col = intake.open_esm_datastore(cat_url)\n", "col" ] @@ -175,7 +183,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Take a look at the first few lines of the enhanced catalog" + "Take a look at the first few lines of the enhanced catalog\n" ] }, { @@ -333,8 +341,8 @@ } ], "source": [ - "print(col.esmcol_data['description'])\n", - "print(\"Catalog file:\", col.esmcol_data['catalog_file'])\n", + "print(col.esmcol_data[\"description\"])\n", + "print(\"Catalog file:\", col.esmcol_data[\"catalog_file\"])\n", "print(col)\n", "\n", "HTML(col.df.head(10).to_html(index=False))" @@ -344,7 +352,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Display all of the `long_name` variable options" + "Display all of the `long_name` variable options\n" ] }, { @@ -363,16 +371,17 @@ } ], "source": [ - "uniques = col.unique(columns=['long_name'])\n", - "nameList = sorted(uniques['long_name']['values'])\n", - "print(nameList, sep='\\n')" + "uniques = col.unique(columns=[\"long_name\"])\n", + "nameList = sorted(uniques[\"long_name\"][\"values\"])\n", + "print(nameList, sep=\"\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If you want to unpack each item for the `print` function, you need to use the wildcard `*`" + "If you want to unpack each item for the `print` function, you need to use the\n", + "wildcard `*`\n" ] }, { @@ -463,15 +472,15 @@ } ], "source": [ - "print(*nameList, sep='\\n')" + "print(*nameList, sep=\"\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "**Note**: For the wildcard search to work, you will need at least intake-esm v2020.08.15." + "**Note**: For the wildcard search to work, you will need at least intake-esm\n", + "v2020.08.15.\n" ] } ], From f5501a1d3b619c78983af054ce0bf60da8ce860b Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Mon, 8 Nov 2021 14:53:42 -0700 Subject: [PATCH 03/30] first pass at editing to_dataset_dict --- intake_esm/core.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 7ac8559e..1e834be3 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -7,6 +7,7 @@ import pandas as pd import pydantic import xarray as xr +import xcollection as xc from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog @@ -441,9 +442,9 @@ def to_dataset_dict( aggregate: pydantic.StrictBool = None, skip_on_error: pydantic.StrictBool = False, **kwargs, - ) -> typing.Dict[str, xr.Dataset]: + ) -> typing.Collection[str, xc.Collection]: """ - Load catalog entries into a dictionary of xarray datasets. + Load catalog entries into a Collection of xarray datasets. Parameters ---------- @@ -466,8 +467,8 @@ def to_dataset_dict( Returns ------- - dsets : dict - A dictionary of xarray :py:class:`~xarray.Dataset`. + dsets : Collection + A Collection of xarray :py:class:`~xarray.Dataset`. Examples -------- @@ -481,7 +482,7 @@ def to_dataset_dict( ... grid_label="gn", ... ) >>> dsets = cat.to_dataset_dict() - >>> dsets.keys() + >>> dsets.keys() ## change this and the following line!! dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn']) >>> dsets["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] @@ -506,7 +507,7 @@ def to_dataset_dict( UserWarning, stacklevel=2, ) - return {} + return xc.Collection({}) if ( self.esmcat.aggregation_control.variable_column_name From 955c3ed3c3cc1e997d3db9b1121a28e3afdbef3d Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Mon, 8 Nov 2021 14:55:17 -0700 Subject: [PATCH 04/30] add xcollection to intake-esm requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 802b7879..5153929d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 +xcollection From 5fc56fb6c792e2d49e073922a44c2cead835714f Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 09:43:30 -0700 Subject: [PATCH 05/30] first pass at collection --- intake_esm/core.py | 5 +++-- requirements.txt | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 1e834be3..70efcb25 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -482,7 +482,7 @@ def to_dataset_dict( ... grid_label="gn", ... ) >>> dsets = cat.to_dataset_dict() - >>> dsets.keys() ## change this and the following line!! + >>> dsets.keys() dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn']) >>> dsets["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] @@ -549,7 +549,7 @@ def to_dataset_dict( self.progressbar = progressbar if self.progressbar: print( - f"""\n--> The keys in the returned dictionary of datasets are constructed as follows:\n\t'{self.key_template}'""" + f"""\n--> The keys in the returned Collection of datasets are constructed as follows:\n\t'{self.key_template}'""" ) sources = {key: source(**source_kwargs) for key, source in self.items()} datasets = {} @@ -571,6 +571,7 @@ def to_dataset_dict( if not skip_on_error: raise exc self.datasets = self._create_derived_variables(datasets, skip_on_error) + self.datasets = xc.Collection(self.datasets) return self.datasets def _create_derived_variables(self, datasets, skip_on_error): diff --git a/requirements.txt b/requirements.txt index 5153929d..636df794 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 -xcollection +git+https://github.com/NCAR/xcollection.git \ No newline at end of file From 3b1626fc8e17c5211f58a3805716c7e94d8fde14 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 16:48:38 +0000 Subject: [PATCH 06/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/source/notebooks/wildcard.ipynb | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/notebooks/wildcard.ipynb b/docs/source/notebooks/wildcard.ipynb index 0bd9c340..22fbff72 100644 --- a/docs/source/notebooks/wildcard.ipynb +++ b/docs/source/notebooks/wildcard.ipynb @@ -89,9 +89,10 @@ "metadata": {}, "outputs": [], "source": [ + "import pprint\n", + "\n", "import intake\n", "import pandas as pd\n", - "import pprint\n", "from IPython.display import HTML" ] }, diff --git a/requirements.txt b/requirements.txt index 636df794..f4101370 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 -git+https://github.com/NCAR/xcollection.git \ No newline at end of file +git+https://github.com/NCAR/xcollection.git From 500357c529fcdcfb553c5bfb1351addc051fe0ea Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 09:54:08 -0700 Subject: [PATCH 07/30] rm xr from imports --- intake_esm/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 70efcb25..2507acbc 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -6,7 +6,6 @@ import dask import pandas as pd import pydantic -import xarray as xr import xcollection as xc from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog From 3c07c6d67eb3b48b056ac8c4a0e33f1c5fcda848 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:04:31 -0700 Subject: [PATCH 08/30] git syntax --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f4101370..51053aaa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 -git+https://github.com/NCAR/xcollection.git +git+git://https://github.com/NCAR/xcollection.git From 7496fa7f83da8a725c6a2d2afac0eaad466c819d Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:10:47 -0700 Subject: [PATCH 09/30] pin mamba version --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a8754df8..afc3c86e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -36,7 +36,7 @@ jobs: auto-update-conda: false python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml - mamba-version: '*' + mamba-version: 0.17.0 use-mamba: true miniforge-variant: Mambaforge From 0e4194a2262a0f880f11f3289a3df63aa6c00b9c Mon Sep 17 00:00:00 2001 From: Julia Kent <46687291+jukent@users.noreply.github.com> Date: Wed, 24 Nov 2021 11:15:26 -0700 Subject: [PATCH 10/30] Delete wildcard.ipynb --- docs/source/notebooks/wildcard.ipynb | 509 --------------------------- 1 file changed, 509 deletions(-) delete mode 100644 docs/source/notebooks/wildcard.ipynb diff --git a/docs/source/notebooks/wildcard.ipynb b/docs/source/notebooks/wildcard.ipynb deleted file mode 100644 index 22fbff72..00000000 --- a/docs/source/notebooks/wildcard.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Search Functionality\n", - "\n", - "## How to use both regular and wildcard expressions in `search()`\n", - "\n", - "It is often useful to search using a wildcard or a regular expression for your\n", - "search.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Case #1 - Inconsistent First Member IDs\n", - "\n", - "Some models don't have the member number **r1i1p1f1**. For example,\n", - "**CNRM-CM6-1**'s first member is **r1i1p1f2**.\n", - "\n", - "A typical query that looks like:\n", - "\n", - "```python\n", - "\n", - "col.search(\n", - " experiment_id=[\"historical\"],\n", - " table_id=\"Amon\",\n", - " variable_id=\"tas\",\n", - " member_id=\"r1i1p1f1\",\n", - ")\n", - "\n", - "```\n", - "\n", - "Will return models that that strictly meet this criteria, excluding all models,\n", - "such as **CNRM-CM-01**, that don't have this first member.\n", - "\n", - "If you want to include models that begin with varying member ids, you can use a\n", - "wildcard (`*`) in your search.\n", - "\n", - "```python\n", - "\n", - "col.search(\n", - " experiment_id=[\"historical\"],\n", - " table_id=\"Amon\",\n", - " variable_id=\"tas\",\n", - " member_id=\"r1i1p1f*\",\n", - ")\n", - "\n", - "```\n", - "\n", - "This search will return all of the target members.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Case #2 Non-CF Standard Names\n", - "\n", - "In some datasets the long names are **not** CF Standard Names, but names\n", - "specified in some other documentation. For this reason the user may not know\n", - "exactly what name to search for without listing all names.\n", - "\n", - "```\n", - "uniques = col.unique(columns=['long_name'])\n", - "nameList = sorted(uniques['long_name']['values'])\n", - "print(*nameList, sep='\\n') #note *list to unpack each item for print function\n", - "```\n", - "\n", - "The above code block uses the wildcard expression (`*`) to find all unique names\n", - "in the collection, then alphabetically sorts and prints them.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The longer example\n", - "\n", - "Import and load in a typical enhanced collection description file\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pprint\n", - "\n", - "import intake\n", - "import pandas as pd\n", - "from IPython.display import HTML" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

aws-cesm1-le catalog with 27 dataset(s) from 365 asset(s):

\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unique
component5
dim2
frequency5
experiment6
start10
end11
variable75
long_name75
path365
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cat_url = \"https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.json\"\n", - "col = intake.open_esm_datastore(cat_url)\n", - "col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Take a look at the first few lines of the enhanced catalog\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This is an inventory of the Community Earth System Model (CESM) Large Ensemble (LENS) dataset in Zarr format publicly available on Amazon S3 (https://doi.org/10.26024/wt24-5j82)\n", - "Catalog file: https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le-enhanced.csv\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
componentdimfrequencyexperimentstartendvariablelong_namepath
atm2DmonthlyHIST1850-011919-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-HIST-FLNS.zarr
atm2Dmonthly20C1920-012005-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-20C-FLNS.zarr
atm2Ddaily20C1920-01-012005-12-31FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS.zarr
atm2DmonthlyRCP852006-012100-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-RCP85-FLNS.zarr
atm2DdailyRCP852006-01-012100-12-31FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/daily/cesmLE-RCP85-FLNS.zarr
atm2DmonthlyCTRL0400-012200-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL-FLNS.zarr
atm2DmonthlyCTRL_AMIP0001-012600-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL_AMIP-FLNS.zarr
atm2DmonthlyCTRL_SLAB_OCN0001-011000-12FLNSNet longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-CTRL_SLAB_OCN-FLNS.zarr
atm2DmonthlyHIST1850-011919-12FLNSCClearsky net longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-HIST-FLNSC.zarr
atm2Dmonthly20C1920-012005-12FLNSCClearsky net longwave flux at surfaces3://ncar-cesm-lens/atm/monthly/cesmLE-20C-FLNSC.zarr
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(col.esmcol_data[\"description\"])\n", - "print(\"Catalog file:\", col.esmcol_data[\"catalog_file\"])\n", - "print(col)\n", - "\n", - "HTML(col.df.head(10).to_html(index=False))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Display all of the `long_name` variable options\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Clearsky net longwave flux at surface', 'Clearsky net solar flux at surface', 'Convective precipitation rate (liq + ice)', 'Convective snow rate (water equivalent)', 'Dissolved Inorganic Carbon', 'Dissolved Organic Carbon', 'Dissolved Oxygen', 'Fraction of sfc area covered by sea-ice', 'Free-Surface Residual Heat Flux', 'Free-Surface Residual Salt Flux', 'Freshwater Flux', 'Geopotential Height (above sea level)', 'Geopotential Z at 500 mbar pressure surface', 'Heat Flux across top face', 'Heat Flux in grid-x direction', 'Heat Flux in grid-y direction', 'Horizontal total wind speed average at the surface', 'Internal Ocean Heat Flux Due to Ice Formation', 'Large-scale (stable) precipitation rate (liq + ice)', 'Large-scale (stable) snow rate (water equivalent)', 'Lowest model level zonal wind', 'Maximum (convective and large-scale) precipitation rate (liq+ice)', 'Maximum reference height temperature over output period', 'Meridional wind', 'Minimum reference height temperature over output period', 'Mixed-Layer Depth', 'Net longwave flux at surface', 'Net solar flux at surface', 'Net solar flux at top of atmosphere', 'Potential Density Ref to Surface', 'Potential Temperature', 'Reference height humidity', 'Reference height temperature', 'Salinity', 'Salt Flux across top face', 'Salt Flux in grid-x direction', 'Salt Flux in grid-y direction', 'Sea Surface Height', 'Sea level pressure', 'Solar Short-Wave Heat Flux', 'Solar Short-Wave Heat Flux in boundary layer', 'Solar Short-Wave Heat Flux in top layer', 'Specific Humidity at 850 mbar pressure surface', 'Specific humidity', 'Surface latent heat flux', 'Surface pressure', 'Surface sensible heat flux', 'Surface temperature (radiative)', 'Temperature', 'Total (convective and large-scale) precipitation rate (liq + ice)', 'Total (vertically integrated) precipitable water', 'Total Surface Heat Flux including short-wave', 'Upwelling longwave flux at top of model', 'Velocity in grid-x direction', 'Velocity in grid-y direction', 'Vertical Velocity', 'Virtual Salt Flux due to weak restoring', 'Virtual Salt Flux in FW Flux formulation', 'Wind stress (squared) in grid-x direction', 'Wind stress (squared) in grid-y direction', 'Wind stress in grid-x direction', 'Wind stress in grid-y direction', 'Zonal wind', 'atmospheric rain', 'atmospheric snow', 'fraction of ground covered by snow', 'grid cellmean ice thickness', 'grid cellmean ice thickness (daily)', 'grid cellmean ice thickness (monthly)', 'ice area, aggregate (daily)', 'ice area, aggregate (monthly)', 'snow depth (liquid water)', 'soil liquid water (vegetated landunits only)', 'soil liquid water + ice in top 10cm of soil (veg landunits only)', 'total liquid runoff (does not include QSNWCPICE)']\n" - ] - } - ], - "source": [ - "uniques = col.unique(columns=[\"long_name\"])\n", - "nameList = sorted(uniques[\"long_name\"][\"values\"])\n", - "print(nameList, sep=\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to unpack each item for the `print` function, you need to use the\n", - "wildcard `*`\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clearsky net longwave flux at surface\n", - "Clearsky net solar flux at surface\n", - "Convective precipitation rate (liq + ice)\n", - "Convective snow rate (water equivalent)\n", - "Dissolved Inorganic Carbon\n", - "Dissolved Organic Carbon\n", - "Dissolved Oxygen\n", - "Fraction of sfc area covered by sea-ice\n", - "Free-Surface Residual Heat Flux\n", - "Free-Surface Residual Salt Flux\n", - "Freshwater Flux\n", - "Geopotential Height (above sea level)\n", - "Geopotential Z at 500 mbar pressure surface\n", - "Heat Flux across top face\n", - "Heat Flux in grid-x direction\n", - "Heat Flux in grid-y direction\n", - "Horizontal total wind speed average at the surface\n", - "Internal Ocean Heat Flux Due to Ice Formation\n", - "Large-scale (stable) precipitation rate (liq + ice)\n", - "Large-scale (stable) snow rate (water equivalent)\n", - "Lowest model level zonal wind\n", - "Maximum (convective and large-scale) precipitation rate (liq+ice)\n", - "Maximum reference height temperature over output period\n", - "Meridional wind\n", - "Minimum reference height temperature over output period\n", - "Mixed-Layer Depth\n", - "Net longwave flux at surface\n", - "Net solar flux at surface\n", - "Net solar flux at top of atmosphere\n", - "Potential Density Ref to Surface\n", - "Potential Temperature\n", - "Reference height humidity\n", - "Reference height temperature\n", - "Salinity\n", - "Salt Flux across top face\n", - "Salt Flux in grid-x direction\n", - "Salt Flux in grid-y direction\n", - "Sea Surface Height\n", - "Sea level pressure\n", - "Solar Short-Wave Heat Flux\n", - "Solar Short-Wave Heat Flux in boundary layer\n", - "Solar Short-Wave Heat Flux in top layer\n", - "Specific Humidity at 850 mbar pressure surface\n", - "Specific humidity\n", - "Surface latent heat flux\n", - "Surface pressure\n", - "Surface sensible heat flux\n", - "Surface temperature (radiative)\n", - "Temperature\n", - "Total (convective and large-scale) precipitation rate (liq + ice)\n", - "Total (vertically integrated) precipitable water\n", - "Total Surface Heat Flux including short-wave\n", - "Upwelling longwave flux at top of model\n", - "Velocity in grid-x direction\n", - "Velocity in grid-y direction\n", - "Vertical Velocity\n", - "Virtual Salt Flux due to weak restoring\n", - "Virtual Salt Flux in FW Flux formulation\n", - "Wind stress (squared) in grid-x direction\n", - "Wind stress (squared) in grid-y direction\n", - "Wind stress in grid-x direction\n", - "Wind stress in grid-y direction\n", - "Zonal wind\n", - "atmospheric rain\n", - "atmospheric snow\n", - "fraction of ground covered by snow\n", - "grid cellmean ice thickness\n", - "grid cellmean ice thickness (daily)\n", - "grid cellmean ice thickness (monthly)\n", - "ice area, aggregate (daily)\n", - "ice area, aggregate (monthly)\n", - "snow depth (liquid water)\n", - "soil liquid water (vegetated landunits only)\n", - "soil liquid water + ice in top 10cm of soil (veg landunits only)\n", - "total liquid runoff (does not include QSNWCPICE)\n" - ] - } - ], - "source": [ - "print(*nameList, sep=\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: For the wildcard search to work, you will need at least intake-esm\n", - "v2020.08.15.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "intake-esm-dev", - "language": "python", - "name": "intake-esm-dev" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From ff58201fa2d17981b6f94f6b9796ede7b156331b Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:24:11 -0700 Subject: [PATCH 11/30] unpin mamba --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index afc3c86e..a8754df8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -36,7 +36,7 @@ jobs: auto-update-conda: false python-version: ${{ matrix.python-version }} environment-file: ci/environment.yml - mamba-version: 0.17.0 + mamba-version: '*' use-mamba: true miniforge-variant: Mambaforge From f62c2e3742682e87518c557b302a95526261c308 Mon Sep 17 00:00:00 2001 From: Kevin Paul Date: Wed, 24 Nov 2021 11:33:01 -0700 Subject: [PATCH 12/30] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 51053aaa..05630e62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 -git+git://https://github.com/NCAR/xcollection.git +git+git://github.com/NCAR/xcollection.git From d8af73541f9a843453125265b57c855cd9c05023 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:49:22 -0700 Subject: [PATCH 13/30] update setup.py --- setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 86a160ad..a0318ba6 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,11 @@ from setuptools import find_packages, setup with open('requirements.txt') as f: - install_requires = f.read().strip().split('\n') - + install_requires = [] + for line in f: + line = line.split('/')[-1] + line = line.replace('.git', '') + install_requires.append(line) if exists('README.md'): with open('README.md') as f: From 451f8eba3a02a8fcd6074a8de8cfc63c1ff0f7bf Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:52:31 -0700 Subject: [PATCH 14/30] change hack --- setup.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index a0318ba6..b3019f05 100644 --- a/setup.py +++ b/setup.py @@ -7,11 +7,8 @@ from setuptools import find_packages, setup with open('requirements.txt') as f: - install_requires = [] - for line in f: - line = line.split('/')[-1] - line = line.replace('.git', '') - install_requires.append(line) + install_requires = f.read().strip().split('\n')[0:-1] + if exists('README.md'): with open('README.md') as f: From 33024771d72dff3d64f33b78ac1c6cc95cbc34c1 Mon Sep 17 00:00:00 2001 From: Kevin Paul Date: Wed, 24 Nov 2021 11:53:19 -0700 Subject: [PATCH 15/30] Update requirements.txt Co-authored-by: Anderson Banihirwe --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 05630e62..f4101370 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19 zarr>=2.5 pydantic>=1.8.2 -git+git://github.com/NCAR/xcollection.git +git+https://github.com/NCAR/xcollection.git From 9d5214803d53ba813f854fbc09564d8b8853a342 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 11:54:58 -0700 Subject: [PATCH 16/30] add xarray back --- intake_esm/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intake_esm/core.py b/intake_esm/core.py index 2507acbc..7daf67de 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -7,6 +7,7 @@ import pandas as pd import pydantic import xcollection as xc +import xarray as xr from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog From 4f9ef5c5885aa6fa4e2ef71cbf704cd6469edf24 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 18:56:46 +0000 Subject: [PATCH 17/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- intake_esm/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 7daf67de..70efcb25 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -6,8 +6,8 @@ import dask import pandas as pd import pydantic -import xcollection as xc import xarray as xr +import xcollection as xc from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog From ffa1dd215cd84a43d8391336dbcb6e573ea030bd Mon Sep 17 00:00:00 2001 From: Julia Kent <46687291+jukent@users.noreply.github.com> Date: Wed, 24 Nov 2021 12:03:55 -0700 Subject: [PATCH 18/30] Update intake_esm/core.py Co-authored-by: Anderson Banihirwe --- intake_esm/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 70efcb25..36eb2d26 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -442,7 +442,7 @@ def to_dataset_dict( aggregate: pydantic.StrictBool = None, skip_on_error: pydantic.StrictBool = False, **kwargs, - ) -> typing.Collection[str, xc.Collection]: + ) -> xc.Collection: """ Load catalog entries into a Collection of xarray datasets. From aae8457cf17d90655c61e8233a440044c8254122 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:12:06 -0700 Subject: [PATCH 19/30] add to_dataet_dict original back --- intake_esm/core.py | 154 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 149 insertions(+), 5 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 36eb2d26..0a2b3e99 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -6,8 +6,8 @@ import dask import pandas as pd import pydantic -import xarray as xr import xcollection as xc +import xarray as xr from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog @@ -248,6 +248,7 @@ def __dir__(self) -> typing.List[str]: rv = [ 'df', 'to_dataset_dict', + 'to_collection' 'keys', 'serialize', 'datasets', @@ -442,9 +443,9 @@ def to_dataset_dict( aggregate: pydantic.StrictBool = None, skip_on_error: pydantic.StrictBool = False, **kwargs, - ) -> xc.Collection: + ) -> typing.Dict[str, xr.Dataset]: """ - Load catalog entries into a Collection of xarray datasets. + Load catalog entries into a dictionary of xarray datasets. Parameters ---------- @@ -467,8 +468,8 @@ def to_dataset_dict( Returns ------- - dsets : Collection - A Collection of xarray :py:class:`~xarray.Dataset`. + dsets : dict + A dictionary of xarray :py:class:`~xarray.Dataset`. Examples -------- @@ -507,6 +508,149 @@ def to_dataset_dict( UserWarning, stacklevel=2, ) + return {} + + if ( + self.esmcat.aggregation_control.variable_column_name + in self.esmcat.aggregation_control.groupby_attrs + ) and len(self.derivedcat) > 0: + raise NotImplementedError( + f'The `{self.esmcat.aggregation_control.variable_column_name}` column name is used as a groupby attribute: {self.esmcat.aggregation_control.groupby_attrs}. ' + 'This is not yet supported when computing derived variables.' + ) + + xarray_open_kwargs = xarray_open_kwargs or {} + xarray_combine_by_coords_kwargs = xarray_combine_by_coords_kwargs or {} + cdf_kwargs, zarr_kwargs = kwargs.get('cdf_kwargs'), kwargs.get('zarr_kwargs') + + if cdf_kwargs or zarr_kwargs: + warnings.warn( + 'cdf_kwargs and zarr_kwargs are deprecated and will be removed in a future version. ' + 'Please use xarray_open_kwargs instead.', + DeprecationWarning, + stacklevel=2, + ) + if cdf_kwargs: + xarray_open_kwargs.update(cdf_kwargs) + if zarr_kwargs: + xarray_open_kwargs.update(zarr_kwargs) + + source_kwargs = dict( + xarray_open_kwargs=xarray_open_kwargs, + xarray_combine_by_coords_kwargs=xarray_combine_by_coords_kwargs, + preprocess=preprocess, + storage_options=storage_options, + requested_variables=self._requested_variables, + ) + + if aggregate is not None and not aggregate: + self = deepcopy(self) + self.esmcat.aggregation_control.groupby_attrs = [] + if progressbar is not None: + self.progressbar = progressbar + if self.progressbar: + print( + f"""\n--> The keys in the returned dictionary of datasets are constructed as follows:\n\t'{self.key_template}'""" + ) + sources = {key: source(**source_kwargs) for key, source in self.items()} + datasets = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=dask.system.CPU_COUNT) as executor: + future_tasks = [ + executor.submit(_load_source, key, source) for key, source in sources.items() + ] + if self.progressbar: + gen = progress_bar( + concurrent.futures.as_completed(future_tasks), total=len(sources) + ) + else: + gen = concurrent.futures.as_completed(future_tasks) + for task in gen: + try: + key, ds = task.result() + datasets[key] = ds + except Exception as exc: + if not skip_on_error: + raise exc + self.datasets = self._create_derived_variables(datasets, skip_on_error) + return self.datasets + + @pydantic.validate_arguments + def to_collection( + self, + xarray_open_kwargs: typing.Dict[str, typing.Any] = None, + xarray_combine_by_coords_kwargs: typing.Dict[str, typing.Any] = None, + preprocess: typing.Callable = None, + storage_options: typing.Dict[pydantic.StrictStr, typing.Any] = None, + progressbar: pydantic.StrictBool = None, + aggregate: pydantic.StrictBool = None, + skip_on_error: pydantic.StrictBool = False, + **kwargs, + ) -> xc.Collection: + """ + Load a dictionary of datasets into a Collection of xarray datasets. + + Parameters + ---------- + xarray_open_kwargs : dict + Keyword arguments to pass to :py:func:`~xarray.open_dataset` function + xarray_combine_by_coords_kwargs: : dict + Keyword arguments to pass to :py:func:`~xarray.combine_by_coords` function. + preprocess : callable, optional + If provided, call this function on each dataset prior to aggregation. + storage_options : dict, optional + Parameters passed to the backend file-system such as Google Cloud Storage, + Amazon Web Service S3. + progressbar : bool + If True, will print a progress bar to standard error (stderr) + when loading assets into :py:class:`~xarray.Dataset`. + aggregate : bool, optional + If False, no aggregation will be done. + skip_on_error : bool, optional + If True, skip datasets that cannot be loaded and/or variables we are unable to derive. + + Returns + ------- + dsets : Collection + A Collection of xarray :py:class:`~xarray.Dataset`. + + Examples + -------- + >>> import intake + >>> col = intake.open_esm_datastore("glade-cmip6.json") + >>> cat = col.search( + ... source_id=["BCC-CSM2-MR", "CNRM-CM6-1", "CNRM-ESM2-1"], + ... experiment_id=["historical", "ssp585"], + ... variable_id="pr", + ... table_id="Amon", + ... grid_label="gn", + ... ) + >>> dset_dict = cat.to_datset_dic() + >>> dset_dict.keys() + dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn']) + >>> dset_dict["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] + + Dimensions: (bnds: 2, lat: 160, lon: 320, member_id: 3, time: 1980) + Coordinates: + * lon (lon) float64 0.0 1.125 2.25 3.375 ... 355.5 356.6 357.8 358.9 + * lat (lat) float64 -89.14 -88.03 -86.91 -85.79 ... 86.91 88.03 89.14 + * time (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00 + * member_id (member_id) + lon_bnds (lon, bnds) float64 dask.array + time_bnds (time, bnds) object dask.array + pr (member_id, time, lat, lon) float32 dask.array + >>> dset_coll = dset_dict.to_collection() + """ + + # Return fast + if not self.keys(): + warnings.warn( + 'There are no datasets to load! Returning an empty Collection.', + UserWarning, + stacklevel=2, + ) return xc.Collection({}) if ( From 23bd360365472f41fd369f64f99087bfc75ee33e Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:17:27 -0700 Subject: [PATCH 20/30] call to_dataset_dict --- intake_esm/core.py | 79 +++------------------------------------------- 1 file changed, 4 insertions(+), 75 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 0a2b3e99..fd886db7 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -624,10 +624,10 @@ def to_collection( ... table_id="Amon", ... grid_label="gn", ... ) - >>> dset_dict = cat.to_datset_dic() - >>> dset_dict.keys() + >>> dsetw = cat.to_collection() + >>> dsets.keys() dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn']) - >>> dset_dict["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] + >>> dsets["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] Dimensions: (bnds: 2, lat: 160, lon: 320, member_id: 3, time: 1980) Coordinates: @@ -641,80 +641,9 @@ def to_collection( lon_bnds (lon, bnds) float64 dask.array time_bnds (time, bnds) object dask.array pr (member_id, time, lat, lon) float32 dask.array - >>> dset_coll = dset_dict.to_collection() """ - # Return fast - if not self.keys(): - warnings.warn( - 'There are no datasets to load! Returning an empty Collection.', - UserWarning, - stacklevel=2, - ) - return xc.Collection({}) - - if ( - self.esmcat.aggregation_control.variable_column_name - in self.esmcat.aggregation_control.groupby_attrs - ) and len(self.derivedcat) > 0: - raise NotImplementedError( - f'The `{self.esmcat.aggregation_control.variable_column_name}` column name is used as a groupby attribute: {self.esmcat.aggregation_control.groupby_attrs}. ' - 'This is not yet supported when computing derived variables.' - ) - - xarray_open_kwargs = xarray_open_kwargs or {} - xarray_combine_by_coords_kwargs = xarray_combine_by_coords_kwargs or {} - cdf_kwargs, zarr_kwargs = kwargs.get('cdf_kwargs'), kwargs.get('zarr_kwargs') - - if cdf_kwargs or zarr_kwargs: - warnings.warn( - 'cdf_kwargs and zarr_kwargs are deprecated and will be removed in a future version. ' - 'Please use xarray_open_kwargs instead.', - DeprecationWarning, - stacklevel=2, - ) - if cdf_kwargs: - xarray_open_kwargs.update(cdf_kwargs) - if zarr_kwargs: - xarray_open_kwargs.update(zarr_kwargs) - - source_kwargs = dict( - xarray_open_kwargs=xarray_open_kwargs, - xarray_combine_by_coords_kwargs=xarray_combine_by_coords_kwargs, - preprocess=preprocess, - storage_options=storage_options, - requested_variables=self._requested_variables, - ) - - if aggregate is not None and not aggregate: - self = deepcopy(self) - self.esmcat.aggregation_control.groupby_attrs = [] - if progressbar is not None: - self.progressbar = progressbar - if self.progressbar: - print( - f"""\n--> The keys in the returned Collection of datasets are constructed as follows:\n\t'{self.key_template}'""" - ) - sources = {key: source(**source_kwargs) for key, source in self.items()} - datasets = {} - with concurrent.futures.ThreadPoolExecutor(max_workers=dask.system.CPU_COUNT) as executor: - future_tasks = [ - executor.submit(_load_source, key, source) for key, source in sources.items() - ] - if self.progressbar: - gen = progress_bar( - concurrent.futures.as_completed(future_tasks), total=len(sources) - ) - else: - gen = concurrent.futures.as_completed(future_tasks) - for task in gen: - try: - key, ds = task.result() - datasets[key] = ds - except Exception as exc: - if not skip_on_error: - raise exc - self.datasets = self._create_derived_variables(datasets, skip_on_error) + self.datasets = self.to_dataset_dict() self.datasets = xc.Collection(self.datasets) return self.datasets From befc742b3e81e9a7fa31033795af3a34e5e778f8 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:18:06 -0700 Subject: [PATCH 21/30] update docstring --- intake_esm/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index fd886db7..b62733e6 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -587,7 +587,7 @@ def to_collection( **kwargs, ) -> xc.Collection: """ - Load a dictionary of datasets into a Collection of xarray datasets. + Load catalog entries into a Collection of xarray datasets. Parameters ---------- From b92b7a0d1d1886833ad73df54c7e84c528f9f94f Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:22:44 -0700 Subject: [PATCH 22/30] add to_dataset_dict input kwargs --- intake_esm/core.py | 11 +- to_datset_dict_test.ipynb | 1974 +++++++++++++++++++++++++++++++++++++ 2 files changed, 1983 insertions(+), 2 deletions(-) create mode 100644 to_datset_dict_test.ipynb diff --git a/intake_esm/core.py b/intake_esm/core.py index b62733e6..a6a83666 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -624,7 +624,7 @@ def to_collection( ... table_id="Amon", ... grid_label="gn", ... ) - >>> dsetw = cat.to_collection() + >>> dsets = cat.to_collection() >>> dsets.keys() dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn']) >>> dsets["CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn"] @@ -643,7 +643,14 @@ def to_collection( pr (member_id, time, lat, lon) float32 dask.array """ - self.datasets = self.to_dataset_dict() + self.datasets = self.to_dataset_dict(xarray_open_kwargs = xarray_open_kwargs, + xarray_combine_by_coords_kwargs = xarray_combine_by_coords_kwargs, + preprocess = preprocess, + storage_options = storage_options, + progressbar = progressbar, + aggregate = aggregate, + skip_on_error = skip_on_error, + **kwargs,) self.datasets = xc.Collection(self.datasets) return self.datasets diff --git a/to_datset_dict_test.ipynb b/to_datset_dict_test.ipynb new file mode 100644 index 00000000..5a2b1811 --- /dev/null +++ b/to_datset_dict_test.ipynb @@ -0,0 +1,1974 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f13c8e39-78ed-439c-b289-533e4dddc3d4", + "metadata": {}, + "outputs": [], + "source": [ + "#pip install git+https://github.com/NCAR/xcollection.git" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6ebc7354-4358-4346-9c22-69c518f2208b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jkent/miniconda3/envs/xcollection-dev/lib/python3.9/site-packages/fastprogress/fastprogress.py:102: UserWarning: Couldn't import ipywidgets properly, progress bar will use console behavior\n", + " warn(\"Couldn't import ipywidgets properly, progress bar will use console behavior\")\n" + ] + } + ], + "source": [ + "import xcollection as xc\n", + "\n", + "import concurrent.futures\n", + "import typing\n", + "\n", + "import warnings\n", + "from copy import deepcopy\n", + "\n", + "import dask\n", + "import pandas as pd\n", + "import pydantic\n", + "import xarray as xr\n", + "import xcollection as xc\n", + "from fastprogress.fastprogress import progress_bar\n", + "from intake.catalog import Catalog\n", + "\n", + "from intake_esm.cat import ESMCatalogModel\n", + "from intake_esm.derived import DerivedVariableRegistry, default_registry\n", + "from intake_esm.source import ESMDataSource\n", + "\n", + "import ast\n", + "import intake\n", + "import pytest" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "851dd553-8f07-48f5-a94e-b0a8464ee99d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021.8.17\n" + ] + } + ], + "source": [ + "import intake_esm\n", + "print(intake_esm.__version__)" + ] + }, + { + "cell_type": "raw", + "id": "40c7294c-e837-4629-913b-474604b8ce70", + "metadata": {}, + "source": [ + "class esm_datastore(Catalog):\n", + " \"\"\"\n", + " An intake plugin for parsing an ESM (Earth System Model) Collection/catalog\n", + " and loading assets (netCDF files and/or Zarr stores) into xarray datasets.\n", + " The in-memory representation for the catalog is a Pandas DataFrame.\n", + "\n", + " Parameters\n", + " ----------\n", + " obj : str, dict\n", + " If string, this must be a path or URL to an ESM collection JSON file.\n", + " If dict, this must be a dict representation of an ESM collection.\n", + " This dict must have two keys: 'esmcat' and 'df'. The 'esmcat' key must be a\n", + " dict representation of the ESM collection and the 'df' key must\n", + " be a Pandas DataFrame containing content that would otherwise be in a CSV file.\n", + " sep : str, optional\n", + " Delimiter to use when constructing a key for a query, by default '.'\n", + " registry : DerivedVariableRegistry, optional\n", + " Registry of derived variables to use, by default None. If not provided, uses the default registry.\n", + " read_csv_kwargs : dict, optional\n", + " Additional keyword arguments passed through to the :py:func:`~pandas.read_csv` function.\n", + " storage_options : dict, optional\n", + " Parameters passed to the backend file-system such as Google Cloud Storage,\n", + " Amazon Web Service S3.\n", + " intake_kwargs: dict, optional\n", + " Additional keyword arguments are passed through to the :py:class:`~intake.catalog.Catalog` base class.\n", + "\n", + " Examples\n", + " --------\n", + "\n", + " At import time, this plugin is available in intake's registry as `esm_datastore` and\n", + " can be accessed with `intake.open_esm_datastore()`:\n", + "\n", + " >>> import intake\n", + " >>> url = \"https://storage.googleapis.com/cmip6/pangeo-cmip6.json\"\n", + " >>> col = intake.open_esm_datastore(url)\n", + " >>> col.df.head()\n", + " activity_id institution_id source_id experiment_id ... variable_id grid_label zstore dcpp_init_year\n", + " 0 AerChemMIP BCC BCC-ESM1 ssp370 ... pr gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", + " 1 AerChemMIP BCC BCC-ESM1 ssp370 ... prsn gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", + " 2 AerChemMIP BCC BCC-ESM1 ssp370 ... tas gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", + " 3 AerChemMIP BCC BCC-ESM1 ssp370 ... tasmax gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", + " 4 AerChemMIP BCC BCC-ESM1 ssp370 ... tasmin gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", + " \"\"\"\n", + "\n", + " name = 'esm_datastore'\n", + " container = 'xarray'\n", + "\n", + " def __init__(\n", + " self,\n", + " obj: typing.Union[pydantic.FilePath, pydantic.AnyUrl, typing.Dict[str, typing.Any]],\n", + " *,\n", + " progressbar: bool = True,\n", + " sep: str = '.',\n", + " registry: typing.Optional[DerivedVariableRegistry] = None,\n", + " read_csv_kwargs: typing.Dict[str, typing.Any] = None,\n", + " storage_options: typing.Dict[str, typing.Any] = None,\n", + " intake_kwargs: typing.Dict[str, typing.Any] = None,\n", + " ):\n", + "\n", + " \"\"\"Intake Catalog representing an ESM Collection.\"\"\"\n", + " intake_kwargs = intake_kwargs or {}\n", + " super(esm_datastore, self).__init__(**intake_kwargs)\n", + " self.storage_options = storage_options or {}\n", + " self.read_csv_kwargs = read_csv_kwargs or {}\n", + " self.progressbar = progressbar\n", + " self.sep = sep\n", + " if isinstance(obj, dict):\n", + " self.esmcat = ESMCatalogModel.from_dict(obj)\n", + " else:\n", + " self.esmcat = ESMCatalogModel.load(\n", + " obj, storage_options=self.storage_options, read_csv_kwargs=read_csv_kwargs\n", + " )\n", + "\n", + " self.derivedcat = registry or default_registry\n", + " self._entries = {}\n", + " self._requested_variables = []\n", + " self.datasets = {}\n", + " self._validate_derivedcat()\n", + " \n", + " def to_dataset_dict(\n", + " self,\n", + " xarray_open_kwargs: typing.Dict[str, typing.Any] = None,\n", + " xarray_combine_by_coords_kwargs: typing.Dict[str, typing.Any] = None,\n", + " preprocess: typing.Callable = None,\n", + " storage_options: typing.Dict[pydantic.StrictStr, typing.Any] = None,\n", + " progressbar: pydantic.StrictBool = None,\n", + " aggregate: pydantic.StrictBool = None,\n", + " skip_on_error: pydantic.StrictBool = False,\n", + " **kwargs,\n", + " ) -> typing.Collection[str, xc.Collection]:\n", + " \"\"\"\n", + " Load catalog entries into a Collection of xarray datasets.\n", + "\n", + " Parameters\n", + " ----------\n", + " xarray_open_kwargs : dict\n", + " Keyword arguments to pass to :py:func:`~xarray.open_dataset` function\n", + " xarray_combine_by_coords_kwargs: : dict\n", + " Keyword arguments to pass to :py:func:`~xarray.combine_by_coords` function.\n", + " preprocess : callable, optional\n", + " If provided, call this function on each dataset prior to aggregation.\n", + " storage_options : dict, optional\n", + " Parameters passed to the backend file-system such as Google Cloud Storage,\n", + " Amazon Web Service S3.\n", + " progressbar : bool\n", + " If True, will print a progress bar to standard error (stderr)\n", + " when loading assets into :py:class:`~xarray.Dataset`.\n", + " aggregate : bool, optional\n", + " If False, no aggregation will be done.\n", + " skip_on_error : bool, optional\n", + " If True, skip datasets that cannot be loaded and/or variables we are unable to derive.\n", + "\n", + " Returns\n", + " -------\n", + " dsets : Collection\n", + " A Collection of xarray :py:class:`~xarray.Dataset`.\n", + "\n", + " Examples\n", + " --------\n", + " >>> import intake\n", + " >>> col = intake.open_esm_datastore(\"glade-cmip6.json\")\n", + " >>> cat = col.search(\n", + " ... source_id=[\"BCC-CSM2-MR\", \"CNRM-CM6-1\", \"CNRM-ESM2-1\"],\n", + " ... experiment_id=[\"historical\", \"ssp585\"],\n", + " ... variable_id=\"pr\",\n", + " ... table_id=\"Amon\",\n", + " ... grid_label=\"gn\",\n", + " ... )\n", + " >>> dsets = cat.to_dataset_dict()\n", + " >>> dsets.keys() ## change this and the following line!!\n", + " dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn'])\n", + " >>> dsets[\"CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn\"]\n", + " \n", + " Dimensions: (bnds: 2, lat: 160, lon: 320, member_id: 3, time: 1980)\n", + " Coordinates:\n", + " * lon (lon) float64 0.0 1.125 2.25 3.375 ... 355.5 356.6 357.8 358.9\n", + " * lat (lat) float64 -89.14 -88.03 -86.91 -85.79 ... 86.91 88.03 89.14\n", + " * time (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00\n", + " * member_id (member_id) \n", + " lon_bnds (lon, bnds) float64 dask.array\n", + " time_bnds (time, bnds) object dask.array\n", + " pr (member_id, time, lat, lon) float32 dask.array\n", + " \"\"\"\n", + "\n", + " # Return fast\n", + " if not self.keys():\n", + " warnings.warn(\n", + " 'There are no datasets to load! Returning an empty dictionary.',\n", + " UserWarning,\n", + " stacklevel=2,\n", + " )\n", + " return xc.Collection({})" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ad0bdee8-f14c-46be-a8f2-5de6695bbc88", + "metadata": {}, + "outputs": [], + "source": [ + "#col = intake.open_esm_datastore(\"glade-cmip6.json\") # No such file?\n", + "# cat = col.search(\n", + "# source_id=[\"BCC-CSM2-MR\", \"CNRM-CM6-1\", \"CNRM-ESM2-1\"],\n", + "# experiment_id=[\"historical\", \"ssp585\"],\n", + "# variable_id=\"pr\",\n", + "# table_id=\"Amon\",\n", + "# grid_label=\"gn\",\n", + "# )\n", + "# dsets = cat.to_dataset_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6e462d2b-6ecd-40c2-a4b8-7f08f69e9592", + "metadata": {}, + "outputs": [], + "source": [ + "#conda list intake" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0aba3b9-835b-421f-b72c-d52fe9065c73", + "metadata": {}, + "outputs": [], + "source": [ + "#pip show intake_esm" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e251b53d-f221-4181-aed2-bf70c9805f96", + "metadata": {}, + "outputs": [], + "source": [ + "#import sys\n", + "#print(sys.executable)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b092add8-2696-4dbe-9082-a95622e77364", + "metadata": {}, + "outputs": [], + "source": [ + "#print(intake.registry)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "014e92a1-f0f7-46d3-a772-b6901a973d1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "

pangeo-cmip6 catalog with 7767 dataset(s) from 521075 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
activity_id18
institution_id36
source_id88
experiment_id170
member_id657
table_id37
variable_id709
grid_label10
zstore521075
dcpp_init_year60
version729
derived_variable_id0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "url = \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", + "col = intake.open_esm_datastore(url)\n", + "print(type(col))\n", + "col" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "94732e64-d07b-457b-b71d-130be06e0711", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

pangeo-cmip6 catalog with 27 dataset(s) from 173 asset(s):

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unique
activity_id2
institution_id13
source_id15
experiment_id2
member_id47
table_id1
variable_id1
grid_label1
zstore173
dcpp_init_year0
version29
derived_variable_id0
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cat = col.search(\n", + " experiment_id=[\"historical\", \"ssp585\"],\n", + " table_id=\"Oyr\",\n", + " variable_id=\"o2\",\n", + " grid_label=\"gn\",\n", + ")\n", + "\n", + "cat" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ef3c20ec-da29-416b-85ba-7837aa491f2b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
activity_idinstitution_idsource_idexperiment_idmember_idtable_idvariable_idgrid_labelzstoredcpp_init_yearversion
0CMIPIPSLIPSL-CM6A-LRhistoricalr24i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
1CMIPIPSLIPSL-CM6A-LRhistoricalr25i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
2CMIPIPSLIPSL-CM6A-LRhistoricalr10i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
3CMIPIPSLIPSL-CM6A-LRhistoricalr11i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
4CMIPIPSLIPSL-CM6A-LRhistoricalr21i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
\n", + "
" + ], + "text/plain": [ + " activity_id institution_id source_id experiment_id member_id table_id \\\n", + "0 CMIP IPSL IPSL-CM6A-LR historical r24i1p1f1 Oyr \n", + "1 CMIP IPSL IPSL-CM6A-LR historical r25i1p1f1 Oyr \n", + "2 CMIP IPSL IPSL-CM6A-LR historical r10i1p1f1 Oyr \n", + "3 CMIP IPSL IPSL-CM6A-LR historical r11i1p1f1 Oyr \n", + "4 CMIP IPSL IPSL-CM6A-LR historical r21i1p1f1 Oyr \n", + "\n", + " variable_id grid_label zstore \\\n", + "0 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", + "1 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", + "2 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", + "3 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", + "4 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", + "\n", + " dcpp_init_year version \n", + "0 NaN 20180803 \n", + "1 NaN 20180803 \n", + "2 NaN 20180803 \n", + "3 NaN 20180803 \n", + "4 NaN 20180803 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat.df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "aa91a42f-aa5f-4c94-a95f-e1c0db820239", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/dd/_xm_pbpd3flgbvbnt7qhd70snnbpj_/T/ipykernel_22903/1715370549.py:1: DeprecationWarning: cdf_kwargs and zarr_kwargs are deprecated and will be removed in a future version. Please use xarray_open_kwargs instead.\n", + " dset_dict = cat.to_dataset_dict(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "--> The keys in the returned dictionary of datasets are constructed as follows:\n", + "\t'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'\n", + "â–ˆ\r" + ] + } + ], + "source": [ + "dset_dict = cat.to_dataset_dict(\n", + " zarr_kwargs={\"consolidated\": True, \"decode_times\": True}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6a78f1f0-71eb-4a04-a149-2903b8c38696", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn', 'ScenarioMIP.CMCC.CMCC-ESM2.ssp585.Oyr.gn', 'CMIP.EC-Earth-Consortium.EC-Earth3-CC.historical.Oyr.gn', 'ScenarioMIP.MRI.MRI-ESM2-0.ssp585.Oyr.gn', 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3-CC.ssp585.Oyr.gn', 'CMIP.CMCC.CMCC-ESM2.historical.Oyr.gn', 'ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.Oyr.gn', 'CMIP.NCC.NorESM2-LM.historical.Oyr.gn', 'CMIP.NCC.NorESM2-MM.historical.Oyr.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Oyr.gn', 'ScenarioMIP.MPI-M.MPI-ESM1-2-LR.ssp585.Oyr.gn', 'CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-LR.historical.Oyr.gn', 'ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'CMIP.CCCma.CanESM5-CanOE.historical.Oyr.gn', 'ScenarioMIP.MIROC.MIROC-ES2L.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-MM.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-LM.ssp585.Oyr.gn', 'ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-HR.historical.Oyr.gn', 'ScenarioMIP.NCAR.CESM2.ssp585.Oyr.gn', 'CMIP.MIROC.MIROC-ES2L.historical.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5.ssp585.Oyr.gn', 'CMIP.CSIRO.ACCESS-ESM1-5.historical.Oyr.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Oyr.gn', 'CMIP.CCCma.CanESM5.historical.Oyr.gn'])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset_dict.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1ae6b93a-d5cf-4fee-8576-51b4f951e2b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dset_collection = xc.Collection(dset_dict)\n", + "dset_collection" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ae82495d-8418-47f1-b971-c168c40ca2c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn', 'ScenarioMIP.CMCC.CMCC-ESM2.ssp585.Oyr.gn', 'CMIP.EC-Earth-Consortium.EC-Earth3-CC.historical.Oyr.gn', 'ScenarioMIP.MRI.MRI-ESM2-0.ssp585.Oyr.gn', 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3-CC.ssp585.Oyr.gn', 'CMIP.CMCC.CMCC-ESM2.historical.Oyr.gn', 'ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.Oyr.gn', 'CMIP.NCC.NorESM2-LM.historical.Oyr.gn', 'CMIP.NCC.NorESM2-MM.historical.Oyr.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Oyr.gn', 'ScenarioMIP.MPI-M.MPI-ESM1-2-LR.ssp585.Oyr.gn', 'CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-LR.historical.Oyr.gn', 'ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'CMIP.CCCma.CanESM5-CanOE.historical.Oyr.gn', 'ScenarioMIP.MIROC.MIROC-ES2L.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-MM.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-LM.ssp585.Oyr.gn', 'ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-HR.historical.Oyr.gn', 'ScenarioMIP.NCAR.CESM2.ssp585.Oyr.gn', 'CMIP.MIROC.MIROC-ES2L.historical.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5.ssp585.Oyr.gn', 'CMIP.CSIRO.ACCESS-ESM1-5.historical.Oyr.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Oyr.gn', 'CMIP.CCCma.CanESM5.historical.Oyr.gn'])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset_collection.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7f31b51d-e6a6-4437-87c7-a4b52021d6ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:         (y: 149, x: 182, nvertex: 4, deptht: 31, axis_nbounds: 2, member_id: 1, dcpp_init_year: 1, time: 165)\n",
+       "Coordinates:\n",
+       "    bounds_nav_lat  (y, x, nvertex) float32 dask.array<chunksize=(149, 182, 4), meta=np.ndarray>\n",
+       "    bounds_nav_lon  (y, x, nvertex) float32 dask.array<chunksize=(149, 182, 4), meta=np.ndarray>\n",
+       "  * deptht          (deptht) float32 5.0 15.0 25.0 ... 4.75e+03 5.25e+03\n",
+       "    deptht_bounds   (deptht, axis_nbounds) float32 dask.array<chunksize=(31, 2), meta=np.ndarray>\n",
+       "    nav_lat         (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
+       "    nav_lon         (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
+       "  * time            (time) object 1850-07-02 12:00:00 ... 2014-07-02 12:00:00\n",
+       "    time_bounds     (time, axis_nbounds) object dask.array<chunksize=(165, 2), meta=np.ndarray>\n",
+       "  * member_id       (member_id) <U8 'r1i1p1f1'\n",
+       "  * dcpp_init_year  (dcpp_init_year) float64 nan\n",
+       "Dimensions without coordinates: y, x, nvertex, axis_nbounds\n",
+       "Data variables:\n",
+       "    area            (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
+       "    o2              (member_id, dcpp_init_year, time, deptht, y, x) float32 dask.array<chunksize=(1, 1, 34, 31, 149, 182), meta=np.ndarray>\n",
+       "Attributes: (12/66)\n",
+       "    CMIP6_CV_version:                 cv=6.2.15.1\n",
+       "    Conventions:                      CF-1.7 CMIP-6.2\n",
+       "    EXPID:                            historical\n",
+       "    activity_id:                      CMIP\n",
+       "    branch_method:                    standard\n",
+       "    branch_time_in_child:             0.0\n",
+       "    ...                               ...\n",
+       "    intake_esm_attrs/variable_id:     o2\n",
+       "    intake_esm_attrs/grid_label:      gn\n",
+       "    intake_esm_attrs/zstore:          gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM5A2-I...\n",
+       "    intake_esm_attrs/dcpp_init_year:  nan\n",
+       "    intake_esm_attrs/version:         20200729\n",
+       "    intake_esm_dataset_key:           CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oy...
" + ], + "text/plain": [ + "\n", + "Dimensions: (y: 149, x: 182, nvertex: 4, deptht: 31, axis_nbounds: 2, member_id: 1, dcpp_init_year: 1, time: 165)\n", + "Coordinates:\n", + " bounds_nav_lat (y, x, nvertex) float32 dask.array\n", + " bounds_nav_lon (y, x, nvertex) float32 dask.array\n", + " * deptht (deptht) float32 5.0 15.0 25.0 ... 4.75e+03 5.25e+03\n", + " deptht_bounds (deptht, axis_nbounds) float32 dask.array\n", + " nav_lat (y, x) float32 dask.array\n", + " nav_lon (y, x) float32 dask.array\n", + " * time (time) object 1850-07-02 12:00:00 ... 2014-07-02 12:00:00\n", + " time_bounds (time, axis_nbounds) object dask.array\n", + " * member_id (member_id) \n", + " o2 (member_id, dcpp_init_year, time, deptht, y, x) float32 dask.array\n", + "Attributes: (12/66)\n", + " CMIP6_CV_version: cv=6.2.15.1\n", + " Conventions: CF-1.7 CMIP-6.2\n", + " EXPID: historical\n", + " activity_id: CMIP\n", + " branch_method: standard\n", + " branch_time_in_child: 0.0\n", + " ... ...\n", + " intake_esm_attrs/variable_id: o2\n", + " intake_esm_attrs/grid_label: gn\n", + " intake_esm_attrs/zstore: gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM5A2-I...\n", + " intake_esm_attrs/dcpp_init_year: nan\n", + " intake_esm_attrs/version: 20200729\n", + " intake_esm_dataset_key: CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oy..." + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dset_collection['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7020c4c2-49c8-4a0e-938d-586e2a8d04af", + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'Collection' object has no attribute 'dims'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/dd/_xm_pbpd3flgbvbnt7qhd70snnbpj_/T/ipykernel_22903/3401585551.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdset_collection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdims\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'Collection' object has no attribute 'dims'" + ] + } + ], + "source": [ + "dset_collection.dims" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45e0e16a-93b9-4ba8-af2e-04e7afc7e2ae", + "metadata": {}, + "outputs": [], + "source": [ + "cat = intake.open_esm_datastore(path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 96a563104a8878a570add1c213836cca19cc563c Mon Sep 17 00:00:00 2001 From: Julia Kent <46687291+jukent@users.noreply.github.com> Date: Wed, 24 Nov 2021 12:23:58 -0700 Subject: [PATCH 23/30] Delete to_datset_dict_test.ipynb --- to_datset_dict_test.ipynb | 1974 ------------------------------------- 1 file changed, 1974 deletions(-) delete mode 100644 to_datset_dict_test.ipynb diff --git a/to_datset_dict_test.ipynb b/to_datset_dict_test.ipynb deleted file mode 100644 index 5a2b1811..00000000 --- a/to_datset_dict_test.ipynb +++ /dev/null @@ -1,1974 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "f13c8e39-78ed-439c-b289-533e4dddc3d4", - "metadata": {}, - "outputs": [], - "source": [ - "#pip install git+https://github.com/NCAR/xcollection.git" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "6ebc7354-4358-4346-9c22-69c518f2208b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jkent/miniconda3/envs/xcollection-dev/lib/python3.9/site-packages/fastprogress/fastprogress.py:102: UserWarning: Couldn't import ipywidgets properly, progress bar will use console behavior\n", - " warn(\"Couldn't import ipywidgets properly, progress bar will use console behavior\")\n" - ] - } - ], - "source": [ - "import xcollection as xc\n", - "\n", - "import concurrent.futures\n", - "import typing\n", - "\n", - "import warnings\n", - "from copy import deepcopy\n", - "\n", - "import dask\n", - "import pandas as pd\n", - "import pydantic\n", - "import xarray as xr\n", - "import xcollection as xc\n", - "from fastprogress.fastprogress import progress_bar\n", - "from intake.catalog import Catalog\n", - "\n", - "from intake_esm.cat import ESMCatalogModel\n", - "from intake_esm.derived import DerivedVariableRegistry, default_registry\n", - "from intake_esm.source import ESMDataSource\n", - "\n", - "import ast\n", - "import intake\n", - "import pytest" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "851dd553-8f07-48f5-a94e-b0a8464ee99d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021.8.17\n" - ] - } - ], - "source": [ - "import intake_esm\n", - "print(intake_esm.__version__)" - ] - }, - { - "cell_type": "raw", - "id": "40c7294c-e837-4629-913b-474604b8ce70", - "metadata": {}, - "source": [ - "class esm_datastore(Catalog):\n", - " \"\"\"\n", - " An intake plugin for parsing an ESM (Earth System Model) Collection/catalog\n", - " and loading assets (netCDF files and/or Zarr stores) into xarray datasets.\n", - " The in-memory representation for the catalog is a Pandas DataFrame.\n", - "\n", - " Parameters\n", - " ----------\n", - " obj : str, dict\n", - " If string, this must be a path or URL to an ESM collection JSON file.\n", - " If dict, this must be a dict representation of an ESM collection.\n", - " This dict must have two keys: 'esmcat' and 'df'. The 'esmcat' key must be a\n", - " dict representation of the ESM collection and the 'df' key must\n", - " be a Pandas DataFrame containing content that would otherwise be in a CSV file.\n", - " sep : str, optional\n", - " Delimiter to use when constructing a key for a query, by default '.'\n", - " registry : DerivedVariableRegistry, optional\n", - " Registry of derived variables to use, by default None. If not provided, uses the default registry.\n", - " read_csv_kwargs : dict, optional\n", - " Additional keyword arguments passed through to the :py:func:`~pandas.read_csv` function.\n", - " storage_options : dict, optional\n", - " Parameters passed to the backend file-system such as Google Cloud Storage,\n", - " Amazon Web Service S3.\n", - " intake_kwargs: dict, optional\n", - " Additional keyword arguments are passed through to the :py:class:`~intake.catalog.Catalog` base class.\n", - "\n", - " Examples\n", - " --------\n", - "\n", - " At import time, this plugin is available in intake's registry as `esm_datastore` and\n", - " can be accessed with `intake.open_esm_datastore()`:\n", - "\n", - " >>> import intake\n", - " >>> url = \"https://storage.googleapis.com/cmip6/pangeo-cmip6.json\"\n", - " >>> col = intake.open_esm_datastore(url)\n", - " >>> col.df.head()\n", - " activity_id institution_id source_id experiment_id ... variable_id grid_label zstore dcpp_init_year\n", - " 0 AerChemMIP BCC BCC-ESM1 ssp370 ... pr gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", - " 1 AerChemMIP BCC BCC-ESM1 ssp370 ... prsn gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", - " 2 AerChemMIP BCC BCC-ESM1 ssp370 ... tas gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", - " 3 AerChemMIP BCC BCC-ESM1 ssp370 ... tasmax gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", - " 4 AerChemMIP BCC BCC-ESM1 ssp370 ... tasmin gn gs://cmip6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1... NaN\n", - " \"\"\"\n", - "\n", - " name = 'esm_datastore'\n", - " container = 'xarray'\n", - "\n", - " def __init__(\n", - " self,\n", - " obj: typing.Union[pydantic.FilePath, pydantic.AnyUrl, typing.Dict[str, typing.Any]],\n", - " *,\n", - " progressbar: bool = True,\n", - " sep: str = '.',\n", - " registry: typing.Optional[DerivedVariableRegistry] = None,\n", - " read_csv_kwargs: typing.Dict[str, typing.Any] = None,\n", - " storage_options: typing.Dict[str, typing.Any] = None,\n", - " intake_kwargs: typing.Dict[str, typing.Any] = None,\n", - " ):\n", - "\n", - " \"\"\"Intake Catalog representing an ESM Collection.\"\"\"\n", - " intake_kwargs = intake_kwargs or {}\n", - " super(esm_datastore, self).__init__(**intake_kwargs)\n", - " self.storage_options = storage_options or {}\n", - " self.read_csv_kwargs = read_csv_kwargs or {}\n", - " self.progressbar = progressbar\n", - " self.sep = sep\n", - " if isinstance(obj, dict):\n", - " self.esmcat = ESMCatalogModel.from_dict(obj)\n", - " else:\n", - " self.esmcat = ESMCatalogModel.load(\n", - " obj, storage_options=self.storage_options, read_csv_kwargs=read_csv_kwargs\n", - " )\n", - "\n", - " self.derivedcat = registry or default_registry\n", - " self._entries = {}\n", - " self._requested_variables = []\n", - " self.datasets = {}\n", - " self._validate_derivedcat()\n", - " \n", - " def to_dataset_dict(\n", - " self,\n", - " xarray_open_kwargs: typing.Dict[str, typing.Any] = None,\n", - " xarray_combine_by_coords_kwargs: typing.Dict[str, typing.Any] = None,\n", - " preprocess: typing.Callable = None,\n", - " storage_options: typing.Dict[pydantic.StrictStr, typing.Any] = None,\n", - " progressbar: pydantic.StrictBool = None,\n", - " aggregate: pydantic.StrictBool = None,\n", - " skip_on_error: pydantic.StrictBool = False,\n", - " **kwargs,\n", - " ) -> typing.Collection[str, xc.Collection]:\n", - " \"\"\"\n", - " Load catalog entries into a Collection of xarray datasets.\n", - "\n", - " Parameters\n", - " ----------\n", - " xarray_open_kwargs : dict\n", - " Keyword arguments to pass to :py:func:`~xarray.open_dataset` function\n", - " xarray_combine_by_coords_kwargs: : dict\n", - " Keyword arguments to pass to :py:func:`~xarray.combine_by_coords` function.\n", - " preprocess : callable, optional\n", - " If provided, call this function on each dataset prior to aggregation.\n", - " storage_options : dict, optional\n", - " Parameters passed to the backend file-system such as Google Cloud Storage,\n", - " Amazon Web Service S3.\n", - " progressbar : bool\n", - " If True, will print a progress bar to standard error (stderr)\n", - " when loading assets into :py:class:`~xarray.Dataset`.\n", - " aggregate : bool, optional\n", - " If False, no aggregation will be done.\n", - " skip_on_error : bool, optional\n", - " If True, skip datasets that cannot be loaded and/or variables we are unable to derive.\n", - "\n", - " Returns\n", - " -------\n", - " dsets : Collection\n", - " A Collection of xarray :py:class:`~xarray.Dataset`.\n", - "\n", - " Examples\n", - " --------\n", - " >>> import intake\n", - " >>> col = intake.open_esm_datastore(\"glade-cmip6.json\")\n", - " >>> cat = col.search(\n", - " ... source_id=[\"BCC-CSM2-MR\", \"CNRM-CM6-1\", \"CNRM-ESM2-1\"],\n", - " ... experiment_id=[\"historical\", \"ssp585\"],\n", - " ... variable_id=\"pr\",\n", - " ... table_id=\"Amon\",\n", - " ... grid_label=\"gn\",\n", - " ... )\n", - " >>> dsets = cat.to_dataset_dict()\n", - " >>> dsets.keys() ## change this and the following line!!\n", - " dict_keys(['CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn', 'ScenarioMIP.BCC.BCC-CSM2-MR.ssp585.Amon.gn'])\n", - " >>> dsets[\"CMIP.BCC.BCC-CSM2-MR.historical.Amon.gn\"]\n", - " \n", - " Dimensions: (bnds: 2, lat: 160, lon: 320, member_id: 3, time: 1980)\n", - " Coordinates:\n", - " * lon (lon) float64 0.0 1.125 2.25 3.375 ... 355.5 356.6 357.8 358.9\n", - " * lat (lat) float64 -89.14 -88.03 -86.91 -85.79 ... 86.91 88.03 89.14\n", - " * time (time) object 1850-01-16 12:00:00 ... 2014-12-16 12:00:00\n", - " * member_id (member_id) \n", - " lon_bnds (lon, bnds) float64 dask.array\n", - " time_bnds (time, bnds) object dask.array\n", - " pr (member_id, time, lat, lon) float32 dask.array\n", - " \"\"\"\n", - "\n", - " # Return fast\n", - " if not self.keys():\n", - " warnings.warn(\n", - " 'There are no datasets to load! Returning an empty dictionary.',\n", - " UserWarning,\n", - " stacklevel=2,\n", - " )\n", - " return xc.Collection({})" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ad0bdee8-f14c-46be-a8f2-5de6695bbc88", - "metadata": {}, - "outputs": [], - "source": [ - "#col = intake.open_esm_datastore(\"glade-cmip6.json\") # No such file?\n", - "# cat = col.search(\n", - "# source_id=[\"BCC-CSM2-MR\", \"CNRM-CM6-1\", \"CNRM-ESM2-1\"],\n", - "# experiment_id=[\"historical\", \"ssp585\"],\n", - "# variable_id=\"pr\",\n", - "# table_id=\"Amon\",\n", - "# grid_label=\"gn\",\n", - "# )\n", - "# dsets = cat.to_dataset_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6e462d2b-6ecd-40c2-a4b8-7f08f69e9592", - "metadata": {}, - "outputs": [], - "source": [ - "#conda list intake" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f0aba3b9-835b-421f-b72c-d52fe9065c73", - "metadata": {}, - "outputs": [], - "source": [ - "#pip show intake_esm" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e251b53d-f221-4181-aed2-bf70c9805f96", - "metadata": {}, - "outputs": [], - "source": [ - "#import sys\n", - "#print(sys.executable)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b092add8-2696-4dbe-9082-a95622e77364", - "metadata": {}, - "outputs": [], - "source": [ - "#print(intake.registry)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "014e92a1-f0f7-46d3-a772-b6901a973d1c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/html": [ - "

pangeo-cmip6 catalog with 7767 dataset(s) from 521075 asset(s):

\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unique
activity_id18
institution_id36
source_id88
experiment_id170
member_id657
table_id37
variable_id709
grid_label10
zstore521075
dcpp_init_year60
version729
derived_variable_id0
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "url = \"https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json\"\n", - "col = intake.open_esm_datastore(url)\n", - "print(type(col))\n", - "col" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "94732e64-d07b-457b-b71d-130be06e0711", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

pangeo-cmip6 catalog with 27 dataset(s) from 173 asset(s):

\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
unique
activity_id2
institution_id13
source_id15
experiment_id2
member_id47
table_id1
variable_id1
grid_label1
zstore173
dcpp_init_year0
version29
derived_variable_id0
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cat = col.search(\n", - " experiment_id=[\"historical\", \"ssp585\"],\n", - " table_id=\"Oyr\",\n", - " variable_id=\"o2\",\n", - " grid_label=\"gn\",\n", - ")\n", - "\n", - "cat" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ef3c20ec-da29-416b-85ba-7837aa491f2b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
activity_idinstitution_idsource_idexperiment_idmember_idtable_idvariable_idgrid_labelzstoredcpp_init_yearversion
0CMIPIPSLIPSL-CM6A-LRhistoricalr24i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
1CMIPIPSLIPSL-CM6A-LRhistoricalr25i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
2CMIPIPSLIPSL-CM6A-LRhistoricalr10i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
3CMIPIPSLIPSL-CM6A-LRhistoricalr11i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
4CMIPIPSLIPSL-CM6A-LRhistoricalr21i1p1f1Oyro2gngs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor...NaN20180803
\n", - "
" - ], - "text/plain": [ - " activity_id institution_id source_id experiment_id member_id table_id \\\n", - "0 CMIP IPSL IPSL-CM6A-LR historical r24i1p1f1 Oyr \n", - "1 CMIP IPSL IPSL-CM6A-LR historical r25i1p1f1 Oyr \n", - "2 CMIP IPSL IPSL-CM6A-LR historical r10i1p1f1 Oyr \n", - "3 CMIP IPSL IPSL-CM6A-LR historical r11i1p1f1 Oyr \n", - "4 CMIP IPSL IPSL-CM6A-LR historical r21i1p1f1 Oyr \n", - "\n", - " variable_id grid_label zstore \\\n", - "0 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", - "1 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", - "2 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", - "3 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", - "4 o2 gn gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/histor... \n", - "\n", - " dcpp_init_year version \n", - "0 NaN 20180803 \n", - "1 NaN 20180803 \n", - "2 NaN 20180803 \n", - "3 NaN 20180803 \n", - "4 NaN 20180803 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cat.df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "aa91a42f-aa5f-4c94-a95f-e1c0db820239", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/dd/_xm_pbpd3flgbvbnt7qhd70snnbpj_/T/ipykernel_22903/1715370549.py:1: DeprecationWarning: cdf_kwargs and zarr_kwargs are deprecated and will be removed in a future version. Please use xarray_open_kwargs instead.\n", - " dset_dict = cat.to_dataset_dict(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "--> The keys in the returned dictionary of datasets are constructed as follows:\n", - "\t'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'\n", - "â–ˆ\r" - ] - } - ], - "source": [ - "dset_dict = cat.to_dataset_dict(\n", - " zarr_kwargs={\"consolidated\": True, \"decode_times\": True}\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "6a78f1f0-71eb-4a04-a149-2903b8c38696", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn', 'ScenarioMIP.CMCC.CMCC-ESM2.ssp585.Oyr.gn', 'CMIP.EC-Earth-Consortium.EC-Earth3-CC.historical.Oyr.gn', 'ScenarioMIP.MRI.MRI-ESM2-0.ssp585.Oyr.gn', 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3-CC.ssp585.Oyr.gn', 'CMIP.CMCC.CMCC-ESM2.historical.Oyr.gn', 'ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.Oyr.gn', 'CMIP.NCC.NorESM2-LM.historical.Oyr.gn', 'CMIP.NCC.NorESM2-MM.historical.Oyr.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Oyr.gn', 'ScenarioMIP.MPI-M.MPI-ESM1-2-LR.ssp585.Oyr.gn', 'CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-LR.historical.Oyr.gn', 'ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'CMIP.CCCma.CanESM5-CanOE.historical.Oyr.gn', 'ScenarioMIP.MIROC.MIROC-ES2L.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-MM.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-LM.ssp585.Oyr.gn', 'ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-HR.historical.Oyr.gn', 'ScenarioMIP.NCAR.CESM2.ssp585.Oyr.gn', 'CMIP.MIROC.MIROC-ES2L.historical.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5.ssp585.Oyr.gn', 'CMIP.CSIRO.ACCESS-ESM1-5.historical.Oyr.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Oyr.gn', 'CMIP.CCCma.CanESM5.historical.Oyr.gn'])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dset_dict.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "1ae6b93a-d5cf-4fee-8576-51b4f951e2b0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "dset_collection = xc.Collection(dset_dict)\n", - "dset_collection" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "ae82495d-8418-47f1-b971-c168c40ca2c4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn', 'ScenarioMIP.CMCC.CMCC-ESM2.ssp585.Oyr.gn', 'CMIP.EC-Earth-Consortium.EC-Earth3-CC.historical.Oyr.gn', 'ScenarioMIP.MRI.MRI-ESM2-0.ssp585.Oyr.gn', 'ScenarioMIP.EC-Earth-Consortium.EC-Earth3-CC.ssp585.Oyr.gn', 'CMIP.CMCC.CMCC-ESM2.historical.Oyr.gn', 'ScenarioMIP.DWD.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5-CanOE.ssp585.Oyr.gn', 'CMIP.NCC.NorESM2-LM.historical.Oyr.gn', 'CMIP.NCC.NorESM2-MM.historical.Oyr.gn', 'CMIP.MRI.MRI-ESM2-0.historical.Oyr.gn', 'ScenarioMIP.MPI-M.MPI-ESM1-2-LR.ssp585.Oyr.gn', 'CMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.historical.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-LR.historical.Oyr.gn', 'ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp585.Oyr.gn', 'CMIP.CCCma.CanESM5-CanOE.historical.Oyr.gn', 'ScenarioMIP.MIROC.MIROC-ES2L.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-MM.ssp585.Oyr.gn', 'ScenarioMIP.NCC.NorESM2-LM.ssp585.Oyr.gn', 'ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585.Oyr.gn', 'CMIP.MPI-M.MPI-ESM1-2-HR.historical.Oyr.gn', 'ScenarioMIP.NCAR.CESM2.ssp585.Oyr.gn', 'CMIP.MIROC.MIROC-ES2L.historical.Oyr.gn', 'ScenarioMIP.CCCma.CanESM5.ssp585.Oyr.gn', 'CMIP.CSIRO.ACCESS-ESM1-5.historical.Oyr.gn', 'CMIP.IPSL.IPSL-CM6A-LR.historical.Oyr.gn', 'CMIP.CCCma.CanESM5.historical.Oyr.gn'])" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dset_collection.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7f31b51d-e6a6-4437-87c7-a4b52021d6ef", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset>\n",
-       "Dimensions:         (y: 149, x: 182, nvertex: 4, deptht: 31, axis_nbounds: 2, member_id: 1, dcpp_init_year: 1, time: 165)\n",
-       "Coordinates:\n",
-       "    bounds_nav_lat  (y, x, nvertex) float32 dask.array<chunksize=(149, 182, 4), meta=np.ndarray>\n",
-       "    bounds_nav_lon  (y, x, nvertex) float32 dask.array<chunksize=(149, 182, 4), meta=np.ndarray>\n",
-       "  * deptht          (deptht) float32 5.0 15.0 25.0 ... 4.75e+03 5.25e+03\n",
-       "    deptht_bounds   (deptht, axis_nbounds) float32 dask.array<chunksize=(31, 2), meta=np.ndarray>\n",
-       "    nav_lat         (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
-       "    nav_lon         (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
-       "  * time            (time) object 1850-07-02 12:00:00 ... 2014-07-02 12:00:00\n",
-       "    time_bounds     (time, axis_nbounds) object dask.array<chunksize=(165, 2), meta=np.ndarray>\n",
-       "  * member_id       (member_id) <U8 'r1i1p1f1'\n",
-       "  * dcpp_init_year  (dcpp_init_year) float64 nan\n",
-       "Dimensions without coordinates: y, x, nvertex, axis_nbounds\n",
-       "Data variables:\n",
-       "    area            (y, x) float32 dask.array<chunksize=(149, 182), meta=np.ndarray>\n",
-       "    o2              (member_id, dcpp_init_year, time, deptht, y, x) float32 dask.array<chunksize=(1, 1, 34, 31, 149, 182), meta=np.ndarray>\n",
-       "Attributes: (12/66)\n",
-       "    CMIP6_CV_version:                 cv=6.2.15.1\n",
-       "    Conventions:                      CF-1.7 CMIP-6.2\n",
-       "    EXPID:                            historical\n",
-       "    activity_id:                      CMIP\n",
-       "    branch_method:                    standard\n",
-       "    branch_time_in_child:             0.0\n",
-       "    ...                               ...\n",
-       "    intake_esm_attrs/variable_id:     o2\n",
-       "    intake_esm_attrs/grid_label:      gn\n",
-       "    intake_esm_attrs/zstore:          gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM5A2-I...\n",
-       "    intake_esm_attrs/dcpp_init_year:  nan\n",
-       "    intake_esm_attrs/version:         20200729\n",
-       "    intake_esm_dataset_key:           CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oy...
" - ], - "text/plain": [ - "\n", - "Dimensions: (y: 149, x: 182, nvertex: 4, deptht: 31, axis_nbounds: 2, member_id: 1, dcpp_init_year: 1, time: 165)\n", - "Coordinates:\n", - " bounds_nav_lat (y, x, nvertex) float32 dask.array\n", - " bounds_nav_lon (y, x, nvertex) float32 dask.array\n", - " * deptht (deptht) float32 5.0 15.0 25.0 ... 4.75e+03 5.25e+03\n", - " deptht_bounds (deptht, axis_nbounds) float32 dask.array\n", - " nav_lat (y, x) float32 dask.array\n", - " nav_lon (y, x) float32 dask.array\n", - " * time (time) object 1850-07-02 12:00:00 ... 2014-07-02 12:00:00\n", - " time_bounds (time, axis_nbounds) object dask.array\n", - " * member_id (member_id) \n", - " o2 (member_id, dcpp_init_year, time, deptht, y, x) float32 dask.array\n", - "Attributes: (12/66)\n", - " CMIP6_CV_version: cv=6.2.15.1\n", - " Conventions: CF-1.7 CMIP-6.2\n", - " EXPID: historical\n", - " activity_id: CMIP\n", - " branch_method: standard\n", - " branch_time_in_child: 0.0\n", - " ... ...\n", - " intake_esm_attrs/variable_id: o2\n", - " intake_esm_attrs/grid_label: gn\n", - " intake_esm_attrs/zstore: gs://cmip6/CMIP6/CMIP/IPSL/IPSL-CM5A2-I...\n", - " intake_esm_attrs/dcpp_init_year: nan\n", - " intake_esm_attrs/version: 20200729\n", - " intake_esm_dataset_key: CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oy..." - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dset_collection['CMIP.IPSL.IPSL-CM5A2-INCA.historical.Oyr.gn']" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "7020c4c2-49c8-4a0e-938d-586e2a8d04af", - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'Collection' object has no attribute 'dims'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/dd/_xm_pbpd3flgbvbnt7qhd70snnbpj_/T/ipykernel_22903/3401585551.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdset_collection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdims\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: 'Collection' object has no attribute 'dims'" - ] - } - ], - "source": [ - "dset_collection.dims" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45e0e16a-93b9-4ba8-af2e-04e7afc7e2ae", - "metadata": {}, - "outputs": [], - "source": [ - "cat = intake.open_esm_datastore(path)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From dfdb763eba5875ca5133364712594ab82a50dfed Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:27:46 +0000 Subject: [PATCH 24/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- intake_esm/core.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index b89b22ee..09b0f668 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -6,8 +6,8 @@ import dask import pandas as pd import pydantic -import xcollection as xc import xarray as xr +import xcollection as xc from fastprogress.fastprogress import progress_bar from intake.catalog import Catalog @@ -248,8 +248,7 @@ def __dir__(self) -> typing.List[str]: rv = [ 'df', 'to_dataset_dict', - 'to_collection' - 'to_dask', + 'to_collection' 'to_dask', 'keys', 'serialize', 'datasets', @@ -644,14 +643,16 @@ def to_collection( pr (member_id, time, lat, lon) float32 dask.array """ - self.datasets = self.to_dataset_dict(xarray_open_kwargs = xarray_open_kwargs, - xarray_combine_by_coords_kwargs = xarray_combine_by_coords_kwargs, - preprocess = preprocess, - storage_options = storage_options, - progressbar = progressbar, - aggregate = aggregate, - skip_on_error = skip_on_error, - **kwargs,) + self.datasets = self.to_dataset_dict( + xarray_open_kwargs=xarray_open_kwargs, + xarray_combine_by_coords_kwargs=xarray_combine_by_coords_kwargs, + preprocess=preprocess, + storage_options=storage_options, + progressbar=progressbar, + aggregate=aggregate, + skip_on_error=skip_on_error, + **kwargs, + ) self.datasets = xc.Collection(self.datasets) return self.datasets From 755dce201be3309b559f653aaca60b922e76d34a Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:33:12 -0700 Subject: [PATCH 25/30] add test --- tests/test_core.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 1e7d7319..35c18b5c 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,6 +5,7 @@ import pydantic import pytest import xarray as xr +import xcollection as xc import intake_esm @@ -240,6 +241,15 @@ def test_to_dataset_dict(path, query, xarray_open_kwargs): assert len(ds.__dask_keys__()) > 0 assert ds.time.encoding +def test_to_collection(path, query, xarray_open_kwargs): + cat = intake.open_esm_datastore(path) + cat_sub = cat.search(**query) + coll = cat_sub.to_collection(xarray_open_kwargs=xarray_open_kwargs) + _, ds = coll.popitem() + assert 'member_id' in ds.dims + assert len(ds.__dask_keys__()) > 0 + assert ds.time.encoding + assert isinstance(coll, xc.Collection) @pytest.mark.parametrize( 'path, query, xarray_open_kwargs', From 5dfea2795a67d94db5573af455bf9de1ea3dd0ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:33:31 +0000 Subject: [PATCH 26/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 35c18b5c..8cb54b34 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -241,6 +241,7 @@ def test_to_dataset_dict(path, query, xarray_open_kwargs): assert len(ds.__dask_keys__()) > 0 assert ds.time.encoding + def test_to_collection(path, query, xarray_open_kwargs): cat = intake.open_esm_datastore(path) cat_sub = cat.search(**query) @@ -251,6 +252,7 @@ def test_to_collection(path, query, xarray_open_kwargs): assert ds.time.encoding assert isinstance(coll, xc.Collection) + @pytest.mark.parametrize( 'path, query, xarray_open_kwargs', [ From d65e9b1e57e9a1190b38e4dd80d06aa702538211 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:43:57 -0700 Subject: [PATCH 27/30] get parameterize --- tests/test_core.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 8cb54b34..9bbd305e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -241,7 +241,28 @@ def test_to_dataset_dict(path, query, xarray_open_kwargs): assert len(ds.__dask_keys__()) > 0 assert ds.time.encoding - +@pytest.mark.parametrize( + 'path, query, xarray_open_kwargs', + [ + ( + zarr_col_pangeo_cmip6, + dict( + variable_id=['pr'], + experiment_id='ssp370', + activity_id='AerChemMIP', + source_id='BCC-ESM1', + table_id='Amon', + grid_label='gn', + ), + {'consolidated': True, 'backend_kwargs': {'storage_options': {'token': 'anon'}}}, + ), + ( + cdf_col_sample_cmip6, + dict(source_id=['CNRM-ESM2-1', 'CNRM-CM6-1', 'BCC-ESM1'], variable_id=['tasmax']), + {'chunks': {'time': 1}}, + ), + ], +) def test_to_collection(path, query, xarray_open_kwargs): cat = intake.open_esm_datastore(path) cat_sub = cat.search(**query) From f8aa2140ac42690560948a799e5aa022b885a04b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 19:44:18 +0000 Subject: [PATCH 28/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_core.py b/tests/test_core.py index 9bbd305e..8fb2fdac 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -241,6 +241,7 @@ def test_to_dataset_dict(path, query, xarray_open_kwargs): assert len(ds.__dask_keys__()) > 0 assert ds.time.encoding + @pytest.mark.parametrize( 'path, query, xarray_open_kwargs', [ From 0ee96ad4bb23edb89eb7cb0146732302443d7655 Mon Sep 17 00:00:00 2001 From: Julia Kent Date: Wed, 24 Nov 2021 12:55:32 -0700 Subject: [PATCH 29/30] use published xcollection --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d8c65ec2..630f89e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ requests>=2.24.0 xarray>=0.19,!=0.20.0,!=0.20.1 zarr>=2.5 pydantic>=1.8.2 -git+https://github.com/NCAR/xcollection.git +xcollection diff --git a/setup.py b/setup.py index b3019f05..86a160ad 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import find_packages, setup with open('requirements.txt') as f: - install_requires = f.read().strip().split('\n')[0:-1] + install_requires = f.read().strip().split('\n') if exists('README.md'): From 254f7095dc7d7799b1653c3b186c0791912f505a Mon Sep 17 00:00:00 2001 From: Julia Kent <46687291+jukent@users.noreply.github.com> Date: Wed, 24 Nov 2021 13:01:41 -0700 Subject: [PATCH 30/30] Update intake_esm/core.py Co-authored-by: Anderson Banihirwe --- intake_esm/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/intake_esm/core.py b/intake_esm/core.py index 09b0f668..3b994716 100644 --- a/intake_esm/core.py +++ b/intake_esm/core.py @@ -248,7 +248,8 @@ def __dir__(self) -> typing.List[str]: rv = [ 'df', 'to_dataset_dict', - 'to_collection' 'to_dask', + 'to_collection', + 'to_dask', 'keys', 'serialize', 'datasets',