From 94be70d09b83fb0fac84c1410a3f3358afc29838 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Fri, 6 May 2022 14:49:09 -0700 Subject: [PATCH 01/11] add BaseRecipe.get_execution_context --- pangeo_forge_recipes/recipes/base.py | 10 ++++++++++ setup.cfg | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pangeo_forge_recipes/recipes/base.py b/pangeo_forge_recipes/recipes/base.py index cea1e92f..1762ad4e 100644 --- a/pangeo_forge_recipes/recipes/base.py +++ b/pangeo_forge_recipes/recipes/base.py @@ -4,6 +4,8 @@ from dataclasses import dataclass, field, replace from typing import Callable, ClassVar +import pkg_resources # type: ignore + from ..executors.base import Pipeline from ..patterns import FilePattern, prune_pattern from ..serialization import dataclass_sha256 @@ -50,6 +52,14 @@ def to_beam(self): def sha256(self): return dataclass_sha256(self, ignore_keys=self._hash_exclude_) + def get_execution_context(self): + return dict( + # See https://stackoverflow.com/a/2073599 re: version + version=pkg_resources.require("registrar")[0].version, + recipe_hash=self.sha256(), + inputs_hash=self.file_pattern.sha256(), + ) + RecipeCompiler = Callable[[BaseRecipe], Pipeline] diff --git a/setup.cfg b/setup.cfg index ac6165d9..56fb3930 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ max-line-length = 100 [isort] known_first_party=pangeo_forge_recipes -known_third_party=aiohttp,apache_beam,click,dask,fsspec,kerchunk,mypy_extensions,numpy,pandas,prefect,pytest,pytest_lazyfixture,setuptools,xarray,yaml,zarr +known_third_party=aiohttp,apache_beam,click,dask,fsspec,kerchunk,mypy_extensions,numpy,pandas,pkg_resources,prefect,pytest,pytest_lazyfixture,setuptools,xarray,yaml,zarr multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 From 3ca6066b4f9ad6b19747f70a0c5ae11b779589da Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Mon, 9 May 2022 09:18:26 -0700 Subject: [PATCH 02/11] get pangeo-forge-recipes (not registrar) version --- pangeo_forge_recipes/recipes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pangeo_forge_recipes/recipes/base.py b/pangeo_forge_recipes/recipes/base.py index 1762ad4e..03c0e842 100644 --- a/pangeo_forge_recipes/recipes/base.py +++ b/pangeo_forge_recipes/recipes/base.py @@ -55,7 +55,7 @@ def sha256(self): def get_execution_context(self): return dict( # See https://stackoverflow.com/a/2073599 re: version - version=pkg_resources.require("registrar")[0].version, + version=pkg_resources.require("pangeo-forge-recipes")[0].version, recipe_hash=self.sha256(), inputs_hash=self.file_pattern.sha256(), ) From a5b6f8f8519193ab2a07a0b499e68adbe579b073 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Mon, 9 May 2022 12:56:47 -0700 Subject: [PATCH 03/11] add test_execution_context --- pangeo_forge_recipes/recipes/base.py | 4 +-- tests/recipe_tests/test_execution_context.py | 29 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 tests/recipe_tests/test_execution_context.py diff --git a/pangeo_forge_recipes/recipes/base.py b/pangeo_forge_recipes/recipes/base.py index 03c0e842..d0e5c35d 100644 --- a/pangeo_forge_recipes/recipes/base.py +++ b/pangeo_forge_recipes/recipes/base.py @@ -56,8 +56,8 @@ def get_execution_context(self): return dict( # See https://stackoverflow.com/a/2073599 re: version 
version=pkg_resources.require("pangeo-forge-recipes")[0].version, - recipe_hash=self.sha256(), - inputs_hash=self.file_pattern.sha256(), + recipe_hash=self.sha256().hex(), + inputs_hash=self.file_pattern.sha256().hex(), ) diff --git a/tests/recipe_tests/test_execution_context.py b/tests/recipe_tests/test_execution_context.py new file mode 100644 index 00000000..b5a42c20 --- /dev/null +++ b/tests/recipe_tests/test_execution_context.py @@ -0,0 +1,29 @@ +import re + +import pandas as pd +import pytest + +from pangeo_forge_recipes.patterns import ConcatDim, FilePattern +from pangeo_forge_recipes.recipes import HDFReferenceRecipe, XarrayZarrRecipe + + +@pytest.fixture +def pattern_for_execution_context(): + dates = pd.date_range("1981-09-01", "1981-09-04", freq="D") + + def make_url(time): + return f"https://data-provider.org/{time}.nc" + + time_concat_dim = ConcatDim("time", dates, nitems_per_file=1) + return FilePattern(make_url, time_concat_dim) + + +@pytest.mark.parametrize("recipe_cls", [XarrayZarrRecipe, HDFReferenceRecipe]) +def test_execution_context(recipe_cls, pattern_for_execution_context): + + recipe = recipe_cls(pattern_for_execution_context) + ec = recipe.get_execution_context() + + assert re.match(r"^([0-9]+)\.([0-9]+)\.([0-9]+)$", ec["version"].split(".dev")[0]) + assert isinstance(ec["recipe_hash"], str) and len(ec["recipe_hash"]) == 64 + assert isinstance(ec["inputs_hash"], str) and len(ec["inputs_hash"]) == 64 From 5dd3f45c2b9a644fc101b8663242066dfae5a626 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Wed, 11 May 2022 16:42:00 -0700 Subject: [PATCH 04/11] write pangeo-forge execution context to store in XarrayZarrRecipe.prepare_target --- pangeo_forge_recipes/recipes/xarray_zarr.py | 4 +++ tests/recipe_tests/test_execution_context.py | 31 +++++++++----------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/pangeo_forge_recipes/recipes/xarray_zarr.py b/pangeo_forge_recipes/recipes/xarray_zarr.py index bb00bf9c..e77a5aa1 100644 --- a/pangeo_forge_recipes/recipes/xarray_zarr.py +++ b/pangeo_forge_recipes/recipes/xarray_zarr.py @@ -577,6 +577,10 @@ def filter_init_chunks(chunk_key): recipe_meta = {"input_sequence_lens": input_sequence_lens} config.storage_config.metadata[_GLOBAL_METADATA_KEY] = recipe_meta + zgroup = zarr.open_group(config.target_mapper) + for k, v in config.get_execution_context().items(): + zgroup.attrs[f"pangeo-forge:{k}"] = v + def store_chunk(chunk_key: ChunkKey, *, config: XarrayZarrRecipe) -> None: if config.storage_config.target is None: diff --git a/tests/recipe_tests/test_execution_context.py b/tests/recipe_tests/test_execution_context.py index b5a42c20..20a3e250 100644 --- a/tests/recipe_tests/test_execution_context.py +++ b/tests/recipe_tests/test_execution_context.py @@ -1,29 +1,26 @@ import re -import pandas as pd import pytest +import xarray as xr +import zarr -from pangeo_forge_recipes.patterns import ConcatDim, FilePattern -from pangeo_forge_recipes.recipes import HDFReferenceRecipe, XarrayZarrRecipe +from pangeo_forge_recipes.recipes import XarrayZarrRecipe -@pytest.fixture -def pattern_for_execution_context(): - dates = pd.date_range("1981-09-01", "1981-09-04", freq="D") +@pytest.mark.parametrize("recipe_cls", [XarrayZarrRecipe]) # HDFReferenceRecipe]) +def test_execution_context(recipe_cls, netcdf_local_file_pattern_sequential): - def make_url(time): - return f"https://data-provider.org/{time}.nc" - - time_concat_dim = ConcatDim("time", dates, nitems_per_file=1) - return 
FilePattern(make_url, time_concat_dim) - - -@pytest.mark.parametrize("recipe_cls", [XarrayZarrRecipe, HDFReferenceRecipe]) -def test_execution_context(recipe_cls, pattern_for_execution_context): - - recipe = recipe_cls(pattern_for_execution_context) + recipe = recipe_cls(netcdf_local_file_pattern_sequential) ec = recipe.get_execution_context() assert re.match(r"^([0-9]+)\.([0-9]+)\.([0-9]+)$", ec["version"].split(".dev")[0]) assert isinstance(ec["recipe_hash"], str) and len(ec["recipe_hash"]) == 64 assert isinstance(ec["inputs_hash"], str) and len(ec["inputs_hash"]) == 64 + + recipe.to_function()() + zgroup = zarr.open_group(recipe.target_mapper) + ds = xr.open_zarr(recipe.target_mapper, consolidated=True) + + for k, v in ec.items(): + assert zgroup.attrs[f"pangeo-forge:{k}"] == v + assert ds.attrs[f"pangeo-forge:{k}"] == v From 2cb2e598c2dc6c88c697dc340a6d5a36d35fd262 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Wed, 11 May 2022 17:03:46 -0700 Subject: [PATCH 05/11] drop execution context vars from XarrayZarrRecipe equality tests --- tests/recipe_tests/test_XarrayZarrRecipe.py | 27 +++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/tests/recipe_tests/test_XarrayZarrRecipe.py b/tests/recipe_tests/test_XarrayZarrRecipe.py index 54c41cbf..f6053179 100644 --- a/tests/recipe_tests/test_XarrayZarrRecipe.py +++ b/tests/recipe_tests/test_XarrayZarrRecipe.py @@ -21,6 +21,17 @@ from pangeo_forge_recipes.storage import MetadataTarget, StorageConfig +def drop_execution_context_attrs(ds: xr.Dataset) -> xr.Dataset: + """Drop pangeo-forge execution context attrs from a dataset.""" + + ds_copy = ds.copy(deep=True) + to_drop = [k for k in ds_copy.attrs if k.startswith("pangeo-forge:")] + for k in to_drop: + del ds_copy.attrs[k] + + return ds_copy + + def make_netCDFtoZarr_recipe( file_pattern, xarray_dataset, target, cache, metadata, extra_kwargs=None ): @@ -120,7 +131,7 @@ def test_recipe(recipe_fixture, execute_recipe): rec = RecipeClass(file_pattern, **kwargs) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) @pytest.mark.parametrize("get_mapper_from", ["storage_config", "target", "target_mapper"]) @@ -139,7 +150,7 @@ def test_recipe_default_storage(recipe_fixture, execute_recipe, get_mapper_from) elif get_mapper_from == "target_mapper": mapper = rec.target_mapper ds_actual = xr.open_zarr(mapper).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) @pytest.mark.parametrize("recipe_fixture", all_recipes) @@ -150,7 +161,7 @@ def test_recipe_with_references(recipe_fixture, execute_recipe): rec = RecipeClass(file_pattern, open_input_with_kerchunk=True, **kwargs) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) @pytest.mark.parametrize("recipe_fixture", all_recipes) @@ -195,7 +206,7 @@ def test_recipe_caching_copying(recipe, execute_recipe, cache_inputs, copy_input ) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) # function passed to preprocessing @@ 
-228,7 +239,7 @@ def test_process(recipe_fixture, execute_recipe, process_input, process_chunk): assert not ds_actual.identical(ds_expected) ds_expected = incr_date(ds_expected) - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) def do_actual_chunks_test( @@ -303,7 +314,7 @@ def do_actual_chunks_test( for dim in ds_actual.dims: assert store[dim].chunks == ds_actual[dim].shape - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) @pytest.mark.parametrize("inputs_per_chunk,subset_inputs", [(1, {}), (1, {"time": 2}), (2, {})]) @@ -376,7 +387,7 @@ def test_no_consolidate_dimension_coordinates(netCDFtoZarr_recipe): rec.consolidate_dimension_coordinates = False rec.to_function()() ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) store = zarr.open_consolidated(target.get_mapper()) assert store["time"].chunks == (file_pattern.nitems_per_input["time"],) @@ -399,7 +410,7 @@ def test_consolidate_dimension_coordinates_with_coordinateless_dimension( rec = RecipeClass(file_pattern, **kwargs) rec.to_function()() ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(ds_actual, ds_expected) + xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) def test_lock_timeout(netCDFtoZarr_recipe_sequential_only, execute_recipe_no_dask): From d356bd7ee41874991e6d9608ef5bde2d2e15c19d Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Wed, 11 May 2022 17:48:23 -0700 Subject: [PATCH 06/11] update execution context version asserts for actions runners compatibility --- setup.cfg | 2 +- tests/recipe_tests/test_execution_context.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/setup.cfg b/setup.cfg index 56fb3930..7c435917 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ max-line-length = 100 [isort] known_first_party=pangeo_forge_recipes -known_third_party=aiohttp,apache_beam,click,dask,fsspec,kerchunk,mypy_extensions,numpy,pandas,pkg_resources,prefect,pytest,pytest_lazyfixture,setuptools,xarray,yaml,zarr +known_third_party=aiohttp,apache_beam,click,dask,fsspec,kerchunk,mypy_extensions,numpy,packaging,pandas,pkg_resources,prefect,pytest,pytest_lazyfixture,setuptools,xarray,yaml,zarr multi_line_output=3 include_trailing_comma=True force_grid_wrap=0 diff --git a/tests/recipe_tests/test_execution_context.py b/tests/recipe_tests/test_execution_context.py index 20a3e250..020a005d 100644 --- a/tests/recipe_tests/test_execution_context.py +++ b/tests/recipe_tests/test_execution_context.py @@ -1,8 +1,7 @@ -import re - import pytest import xarray as xr import zarr +from packaging import version from pangeo_forge_recipes.recipes import XarrayZarrRecipe @@ -13,7 +12,11 @@ def test_execution_context(recipe_cls, netcdf_local_file_pattern_sequential): recipe = recipe_cls(netcdf_local_file_pattern_sequential) ec = recipe.get_execution_context() - assert re.match(r"^([0-9]+)\.([0-9]+)\.([0-9]+)$", ec["version"].split(".dev")[0]) + ec_version = version.parse(ec["version"]) + assert ec_version.is_devrelease # should be True for editable installs used in tests + assert isinstance(ec_version.major, int) and 0 <= ec_version.major <= 1 + assert isinstance(ec_version.minor, int) and 0 <= 
ec_version.minor <= 99
+
     assert isinstance(ec["recipe_hash"], str) and len(ec["recipe_hash"]) == 64
     assert isinstance(ec["inputs_hash"], str) and len(ec["inputs_hash"]) == 64

From d3fcf0fdf7baee85fb16d7db5f0d1f5a1b39f0d3 Mon Sep 17 00:00:00 2001
From: Charles Stern <62192187+cisaacstern@users.noreply.github.com>
Date: Wed, 11 May 2022 18:06:39 -0700
Subject: [PATCH 07/11] execution context docs first pass

---
 .../development/release_notes.md   |  6 +++++
 .../recipe_user_guide/execution.md | 23 +++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/docs/pangeo_forge_recipes/development/release_notes.md b/docs/pangeo_forge_recipes/development/release_notes.md
index 4dcee1f5..e6687025 100644
--- a/docs/pangeo_forge_recipes/development/release_notes.md
+++ b/docs/pangeo_forge_recipes/development/release_notes.md
@@ -1,5 +1,11 @@
 # Release Notes
 
+## v0.9.1 - Unreleased
+
+- Persist Pangeo Forge execution context metadata in target datasets. This information, which includes
+the `pangeo-forge-recipes` version as well as recipe and input hashes, attaches execution provenance
+to the dataset itself. {pull}`359`
+
 ## v0.9 - 2022-05-11
 
 - **Breaking changes:** Deprecated `XarrayZarrRecipe` manual stage methods. Manual execution can be
diff --git a/docs/pangeo_forge_recipes/recipe_user_guide/execution.md b/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
index 8d4ffa65..14661c5c 100644
--- a/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
+++ b/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
@@ -91,3 +91,26 @@ with beam.Pipeline() as p:
 
 By default the pipeline runs using Beam's [DirectRunner](https://beam.apache.org/documentation/runners/direct/).
 See [runners](https://beam.apache.org/documentation/#runners) for more.
+
+
+## Execution context
+
+All Pangeo Forge {doc}`recipes` contain a `.get_execution_context()` method which returns the
+following metadata:
+
+```{code-block} python
+{
+    "pangeo-forge:version": "{pangeo_forge_recipes version installed at time of execution}"
+    "pangeo-forge:recipe_hash": "{recipe hash as returned by `recipe.sha256()`}"
+    "pangeo-forge:inputs_hash": "{file pattern hash as returned by `recipe.file_pattern.sha256()`}"
+}
+```
+
+Each recipe class defines where to store this metadata:
+
+- `XarrayZarrRecipe`: Added to Zarr group attributes, and therefore also available via the
+`xarray.Dataset.attrs` when opening Zarr stores with xarray.
+- `HDFReferenceRecipe`:
+
+The execution context metadata which is persisted in the target dataset is used for tracking
+dataset provenance.

From 62183f35badc963d36b0622954d03d76865a494f Mon Sep 17 00:00:00 2001
From: Charles Stern <62192187+cisaacstern@users.noreply.github.com>
Date: Thu, 12 May 2022 09:39:11 -0700
Subject: [PATCH 08/11] mark HDFReference recipe execution context as TODO in docs

---
 docs/pangeo_forge_recipes/recipe_user_guide/execution.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/pangeo_forge_recipes/recipe_user_guide/execution.md b/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
index 14661c5c..2652114c 100644
--- a/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
+++ b/docs/pangeo_forge_recipes/recipe_user_guide/execution.md
@@ -110,7 +110,7 @@ Each recipe class defines where to store this metadata:
 
 - `XarrayZarrRecipe`: Added to Zarr group attributes, and therefore also available via the
 `xarray.Dataset.attrs` when opening Zarr stores with xarray.
-- `HDFReferenceRecipe`: +- `HDFReferenceRecipe`: TODO The execution context metadata which is persisted in the target dataset is used for tracking dataset provenance. From b781e309920ebcc7306b6891183041598c7a7cac Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 12 May 2022 09:42:55 -0700 Subject: [PATCH 09/11] add assert identical wrapper to test_XarrayZarrRecipe --- tests/recipe_tests/test_XarrayZarrRecipe.py | 25 ++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/recipe_tests/test_XarrayZarrRecipe.py b/tests/recipe_tests/test_XarrayZarrRecipe.py index f6053179..bd761c9c 100644 --- a/tests/recipe_tests/test_XarrayZarrRecipe.py +++ b/tests/recipe_tests/test_XarrayZarrRecipe.py @@ -32,6 +32,15 @@ def drop_execution_context_attrs(ds: xr.Dataset) -> xr.Dataset: return ds_copy +def assert_identical(ds1: xr.Dataset, ds2: xr.Dataset): + """Assert that two datasets are identical, excluding execution context attrs.""" + + xr.testing.assert_identical( + drop_execution_context_attrs(ds1), + drop_execution_context_attrs(ds2), + ) + + def make_netCDFtoZarr_recipe( file_pattern, xarray_dataset, target, cache, metadata, extra_kwargs=None ): @@ -131,7 +140,7 @@ def test_recipe(recipe_fixture, execute_recipe): rec = RecipeClass(file_pattern, **kwargs) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) @pytest.mark.parametrize("get_mapper_from", ["storage_config", "target", "target_mapper"]) @@ -150,7 +159,7 @@ def test_recipe_default_storage(recipe_fixture, execute_recipe, get_mapper_from) elif get_mapper_from == "target_mapper": mapper = rec.target_mapper ds_actual = xr.open_zarr(mapper).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) @pytest.mark.parametrize("recipe_fixture", all_recipes) @@ -161,7 +170,7 @@ def test_recipe_with_references(recipe_fixture, execute_recipe): rec = RecipeClass(file_pattern, open_input_with_kerchunk=True, **kwargs) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) @pytest.mark.parametrize("recipe_fixture", all_recipes) @@ -206,7 +215,7 @@ def test_recipe_caching_copying(recipe, execute_recipe, cache_inputs, copy_input ) execute_recipe(rec) ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) # function passed to preprocessing @@ -239,7 +248,7 @@ def test_process(recipe_fixture, execute_recipe, process_input, process_chunk): assert not ds_actual.identical(ds_expected) ds_expected = incr_date(ds_expected) - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) def do_actual_chunks_test( @@ -314,7 +323,7 @@ def do_actual_chunks_test( for dim in ds_actual.dims: assert store[dim].chunks == ds_actual[dim].shape - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) @pytest.mark.parametrize("inputs_per_chunk,subset_inputs", [(1, {}), (1, {"time": 2}), (2, {})]) @@ -387,7 +396,7 @@ def 
test_no_consolidate_dimension_coordinates(netCDFtoZarr_recipe): rec.consolidate_dimension_coordinates = False rec.to_function()() ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) store = zarr.open_consolidated(target.get_mapper()) assert store["time"].chunks == (file_pattern.nitems_per_input["time"],) @@ -410,7 +419,7 @@ def test_consolidate_dimension_coordinates_with_coordinateless_dimension( rec = RecipeClass(file_pattern, **kwargs) rec.to_function()() ds_actual = xr.open_zarr(target.get_mapper()).load() - xr.testing.assert_identical(drop_execution_context_attrs(ds_actual), ds_expected) + assert_identical(ds_actual, ds_expected) def test_lock_timeout(netCDFtoZarr_recipe_sequential_only, execute_recipe_no_dask): From a84793db5ee9424740389ae64970f56cbf07c806 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Thu, 12 May 2022 09:56:02 -0700 Subject: [PATCH 10/11] add json handler for funcs, update test bc funcs are now serializable --- pangeo_forge_recipes/serialization.py | 3 +++ tests/test_serialization.py | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pangeo_forge_recipes/serialization.py b/pangeo_forge_recipes/serialization.py index edeb56ff..e5eb30c6 100644 --- a/pangeo_forge_recipes/serialization.py +++ b/pangeo_forge_recipes/serialization.py @@ -1,3 +1,4 @@ +import inspect from collections.abc import Collection from dataclasses import asdict from enum import Enum @@ -17,6 +18,8 @@ def either_encode_or_hash(obj: Any): return obj.value elif hasattr(obj, "sha256"): return obj.sha256().hex() + elif inspect.isfunction(obj): + return inspect.getsource(obj) raise TypeError(f"object of type {type(obj).__name__} not serializable") diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 689911db..45fa52c1 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -1,6 +1,6 @@ from dataclasses import asdict, dataclass, field from datetime import datetime, timedelta -from typing import Callable, Optional +from typing import Optional import pandas as pd import pytest @@ -171,17 +171,17 @@ class NewRelease(cls): def test_either_encode_or_hash_raises(): - def f(): + class A: pass @dataclass class HasUnserializableField: - unserializable_field: Callable = f + unserializable_field: type = A - expected_msg = f"object of type {type(f).__name__} not serializable" + expected_msg = f"object of type {type(A).__name__} not serializable" with pytest.raises(TypeError, match=expected_msg): - either_encode_or_hash(f) + either_encode_or_hash(A) with pytest.raises(TypeError, match=expected_msg): # in practice, we never actually call ``either_encode_or_hash`` directly. 
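
For context on what PATCH 10 changes: `either_encode_or_hash` is the JSON fallback encoder used (indirectly, via `dataclass_sha256`) when hashing a recipe, and with this commit it encodes plain functions as their source text instead of raising `TypeError`. The sketch below restates that dispatch logic in a self-contained form; the `Enum` check stands in for the elided first branch (the diff only shows its `return obj.value` line), and `process_input` is a made-up stand-in for a user-supplied preprocessing function.

```python
import inspect
import json
from enum import Enum
from typing import Any


def either_encode_or_hash(obj: Any):
    """Simplified restatement of the fallback encoder touched by PATCH 10."""
    if isinstance(obj, Enum):  # assumed: the elided first branch returns obj.value for Enums
        return obj.value
    elif hasattr(obj, "sha256"):
        return obj.sha256().hex()
    elif inspect.isfunction(obj):
        return inspect.getsource(obj)
    raise TypeError(f"object of type {type(obj).__name__} not serializable")


def process_input(ds):
    # Hypothetical preprocessing function; any plain function would do.
    return ds


# A function-valued field now serializes to its own source text rather than
# raising TypeError, which is why the updated test switches to a class to
# trigger the "not serializable" error path.
print(json.dumps({"process_input": process_input}, default=either_encode_or_hash, indent=2))
```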
From 149108cd2e04c7e6f415c56830fa51b00977d282 Mon Sep 17 00:00:00 2001 From: Charles Stern <62192187+cisaacstern@users.noreply.github.com> Date: Mon, 6 Jun 2022 10:07:47 -0700 Subject: [PATCH 11/11] use ds.copy() without deep=True --- tests/recipe_tests/test_XarrayZarrRecipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recipe_tests/test_XarrayZarrRecipe.py b/tests/recipe_tests/test_XarrayZarrRecipe.py index bd761c9c..3c3f7ff5 100644 --- a/tests/recipe_tests/test_XarrayZarrRecipe.py +++ b/tests/recipe_tests/test_XarrayZarrRecipe.py @@ -24,7 +24,7 @@ def drop_execution_context_attrs(ds: xr.Dataset) -> xr.Dataset: """Drop pangeo-forge execution context attrs from a dataset.""" - ds_copy = ds.copy(deep=True) + ds_copy = ds.copy() to_drop = [k for k in ds_copy.attrs if k.startswith("pangeo-forge:")] for k in to_drop: del ds_copy.attrs[k]
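
Finally, a rough end-to-end sketch of how the new provenance metadata surfaces to a user, pieced together from the fixture in PATCH 03 and the assertions in test_execution_context.py. The `https://data-provider.org/...` URLs are placeholders, so only the pre-execution call is expected to run as written; the commented lines indicate how the persisted `pangeo-forge:` attributes would be read back once the recipe has been executed against real, openable files.

```python
import pandas as pd

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

# Placeholder pattern, mirroring the fixture added in PATCH 03.
dates = pd.date_range("1981-09-01", "1981-09-04", freq="D")


def make_url(time):
    return f"https://data-provider.org/{time}.nc"


pattern = FilePattern(make_url, ConcatDim("time", dates, nitems_per_file=1))
recipe = XarrayZarrRecipe(pattern)

# Available before execution: the installed pangeo-forge-recipes version plus
# hex digests of the recipe and of its file pattern.
print(recipe.get_execution_context())

# After executing a recipe whose pattern points at real files, the same values
# are persisted as "pangeo-forge:"-prefixed attributes on the target Zarr group
# (and therefore appear in ds.attrs when the store is opened with xarray):
#
#   recipe.to_function()()
#   ds = xarray.open_zarr(recipe.target_mapper, consolidated=True)
#   provenance = {k: v for k, v in ds.attrs.items() if k.startswith("pangeo-forge:")}
```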