diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst
index 58b573f0..55d62023 100644
--- a/docs/catalogs/arguments.rst
+++ b/docs/catalogs/arguments.rst
@@ -150,13 +150,6 @@ You can find the full API documentation for
                 smaller_table = filter_nonsense(smaller_table)
                 yield smaller_table.to_pandas()

-        def provenance_info(self) -> dict:
-            provenance_info = {
-                "input_reader_type": "StarrReader",
-                "chunksize": self.chunksize,
-            }
-            return provenance_info
-
     ...

     args = ImportArguments(
diff --git a/docs/guide/index_table.rst b/docs/guide/index_table.rst
index fc7232b5..9bff8ea1 100644
--- a/docs/guide/index_table.rst
+++ b/docs/guide/index_table.rst
@@ -120,7 +120,7 @@ string sorting will be smart enough to collate the various strings appropriately

 .. code-block:: python

-    divisions = [f"Gaia DR3 {i}" for i in range(10000, 99999, 12)]
+    divisions = [f"Gaia DR3 {i}" for i in range(10_000, 99_999, 12)]
     divisions.append("Gaia DR3 999999988604363776")

 Getting hints from ``_metadata``
diff --git a/src/.pylintrc b/src/.pylintrc
index 25eff61d..4bf8b358 100644
--- a/src/.pylintrc
+++ b/src/.pylintrc
@@ -280,6 +280,8 @@ ignored-parents=
 # Maximum number of arguments for function / method.
 max-args=10

+max-positional-arguments=15
+
 # Maximum number of attributes for a class (see R0902).
 max-attributes=20
diff --git a/src/hats_import/catalog/arguments.py b/src/hats_import/catalog/arguments.py
index cb56c025..d6851c8d 100644
--- a/src/hats_import/catalog/arguments.py
+++ b/src/hats_import/catalog/arguments.py
@@ -128,7 +128,9 @@ def _check_arguments(self):
         # Basic checks complete - make more checks and create directories where necessary
         self.input_paths = find_input_paths(self.input_path, "**/*.*", self.input_file_list)

-    def to_table_properties(self, total_rows: int) -> TableProperties:
+    def to_table_properties(
+        self, total_rows: int, highest_order: int, moc_sky_fraction: float
+    ) -> TableProperties:
         """Catalog-type-specific dataset info."""
         info = {
             "catalog_name": self.output_artifact_name,
@@ -136,32 +138,13 @@ def to_table_properties(self, total_rows: int) -> TableProperties:
             "total_rows": total_rows,
             "ra_column": self.ra_column,
             "dec_column": self.dec_column,
-        }
+            "hats_cols_sort": self.sort_columns,
+            "hats_max_rows": self.pixel_threshold,
+            "hats_order": highest_order,
+            "moc_sky_fraction": f"{moc_sky_fraction:0.5f}",
+        } | self.extra_property_dict()
         return TableProperties(**info)

-    def additional_runtime_provenance_info(self) -> dict:
-        file_reader_info = {"type": self.file_reader}
-        if isinstance(self.file_reader, InputReader):
-            file_reader_info = self.file_reader.provenance_info()
-        return {
-            "catalog_name": self.output_artifact_name,
-            "catalog_type": self.catalog_type,
-            "input_path": self.input_path,
-            "input_paths": self.input_paths,
-            "input_file_list": self.input_file_list,
-            "ra_column": self.ra_column,
-            "dec_column": self.dec_column,
-            "use_healpix_29": self.use_healpix_29,
-            "sort_columns": self.sort_columns,
-            "constant_healpix_order": self.constant_healpix_order,
-            "lowest_healpix_order": self.lowest_healpix_order,
-            "highest_healpix_order": self.highest_healpix_order,
-            "pixel_threshold": self.pixel_threshold,
-            "mapping_healpix_order": self.mapping_healpix_order,
-            "debug_stats_only": self.debug_stats_only,
-            "file_reader_info": file_reader_info,
-        }
-

 def check_healpix_order_range(order, field_name, lower_bound=0, upper_bound=hipscat_id.SPATIAL_INDEX_ORDER):
     """Helper method to check if the ``order`` is within the range determined by the
diff --git a/src/hats_import/catalog/file_readers.py b/src/hats_import/catalog/file_readers.py
index a647e192..40c05be5 100644
--- a/src/hats_import/catalog/file_readers.py
+++ b/src/hats_import/catalog/file_readers.py
@@ -98,21 +98,6 @@ def read(self, input_file, read_columns=None):
             DataFrame containing chunk of file info.
         """

-    def provenance_info(self) -> dict:
-        """Create dictionary of parameters for provenance tracking.
-
-        If any `storage_options` have been provided as kwargs, we will replace the
-        value with ``REDACTED`` for the purpose of writing to provenance info, as it
-        may contain user names or API keys.
-
-        Returns:
-            dictionary with all argument_name -> argument_value as key -> value pairs.
-        """
-        all_args = vars(self)
-        if "kwargs" in all_args and "storage_options" in all_args["kwargs"]:
-            all_args["kwargs"]["storage_options"] = "REDACTED"
-        return {"input_reader_type": type(self).__name__, **vars(self)}
-
     def regular_file_exists(self, input_file, **_kwargs):
         """Check that the `input_file` points to a single regular file
diff --git a/src/hats_import/catalog/run_import.py b/src/hats_import/catalog/run_import.py
index 7ae0476f..be08b42f 100644
--- a/src/hats_import/catalog/run_import.py
+++ b/src/hats_import/catalog/run_import.py
@@ -7,7 +7,7 @@
 import os
 import pickle

-import hats.io.write_metadata as io
+import hats.io.file_io as io
 from hats.catalog import PartitionInfo
 from hats.io import paths
 from hats.io.parquet_metadata import write_parquet_metadata
@@ -122,11 +122,6 @@ def run(args, client):
     # All done - write out the metadata
     if resume_plan.should_run_finishing:
         with resume_plan.print_progress(total=5, stage_name="Finishing") as step_progress:
-            catalog_info = args.to_table_properties(total_rows)
-            catalog_info.to_properties_file(args.catalog_path)
-            step_progress.update(1)
-            ## TODO - optionally write out arguments file
-            step_progress.update(1)
             partition_info = PartitionInfo.from_healpix(resume_plan.get_destination_pixels())
             partition_info_file = paths.get_partition_info_pointer(args.catalog_path)
             partition_info.write_to_file(partition_info_file)
@@ -140,7 +135,12 @@ def run(args, client):
             else:
                 partition_info.write_to_metadata_files(args.catalog_path)
             step_progress.update(1)
-            io.write_fits_map(args.catalog_path, raw_histogram)
+            catalog_info = args.to_table_properties(
+                total_rows, partition_info.get_highest_order(), partition_info.calculate_fractional_coverage()
+            )
+            catalog_info.to_properties_file(args.catalog_path)
+            step_progress.update(1)
+            io.write_fits_image(raw_histogram, paths.get_point_map_file_pointer(args.catalog_path))
             step_progress.update(1)
             resume_plan.clean_resume_files()
             step_progress.update(1)
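A minimal sketch, not part of the patch: the finishing stage above (and the soap and margin-cache equivalents later in this diff) now derives `hats_order` and `moc_sky_fraction` from the written partition list rather than from the import arguments. The `HealpixPixel` values below are invented for illustration.

    from hats.catalog import PartitionInfo
    from hats.pixel_math.healpix_pixel import HealpixPixel

    # Build a PartitionInfo the same way run() does, then pull the two derived values.
    partition_info = PartitionInfo.from_healpix([HealpixPixel(1, 44), HealpixPixel(2, 180)])
    highest_order = partition_info.get_highest_order()                # 2 for these pixels
    moc_sky_fraction = partition_info.calculate_fractional_coverage()
    print(highest_order, f"{moc_sky_fraction:0.5f}")                  # same formatting used by to_table_properties()
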
diff --git a/src/hats_import/index/arguments.py b/src/hats_import/index/arguments.py
index 0cbf53a7..aba9a700 100644
--- a/src/hats_import/index/arguments.py
+++ b/src/hats_import/index/arguments.py
@@ -85,22 +85,11 @@ def to_table_properties(self, total_rows: int) -> TableProperties:
         """Catalog-type-specific dataset info."""
         info = {
             "catalog_name": self.output_artifact_name,
-            "total_rows": total_rows,
             "catalog_type": "index",
+            "total_rows": total_rows,
             "primary_catalog": str(self.input_catalog_path),
             "indexing_column": self.indexing_column,
-        }
-        if len(self.extra_columns) > 0:
-            info["extra_columns"] = self.extra_columns
+            "extra_columns": self.extra_columns,
+        } | self.extra_property_dict()
         return TableProperties(**info)
-
-    def additional_runtime_provenance_info(self) -> dict:
-        return {
-            "input_catalog_path": self.input_catalog_path,
-            "indexing_column": self.indexing_column,
-            "extra_columns": self.extra_columns,
-            "include_healpix_29": self.include_healpix_29,
-            "include_order_pixel": self.include_order_pixel,
-            "include_radec": self.include_radec,
-        }
diff --git a/src/hats_import/index/run_index.py b/src/hats_import/index/run_index.py
index b26eaf4d..2547e883 100644
--- a/src/hats_import/index/run_index.py
+++ b/src/hats_import/index/run_index.py
@@ -1,6 +1,6 @@
 """Create columnar index of hats table using dask for parallelization"""

-from hats.io import file_io, parquet_metadata, write_metadata
+from hats.io import file_io, parquet_metadata

 import hats_import.index.map_reduce as mr
 from hats_import.index.arguments import IndexArguments
diff --git a/src/hats_import/margin_cache/margin_cache.py b/src/hats_import/margin_cache/margin_cache.py
index 1f83daf6..4042cde1 100644
--- a/src/hats_import/margin_cache/margin_cache.py
+++ b/src/hats_import/margin_cache/margin_cache.py
@@ -1,5 +1,5 @@
 from hats.catalog import PartitionInfo
-from hats.io import file_io, parquet_metadata, paths, write_metadata
+from hats.io import file_io, parquet_metadata, paths

 import hats_import.margin_cache.margin_cache_map_reduce as mcmr
 from hats_import.margin_cache.margin_cache_resume_plan import MarginCachePlan
@@ -63,11 +63,13 @@ def generate_margin_cache(args, client):
             partition_info = PartitionInfo.read_from_file(metadata_path)
             partition_info_file = paths.get_partition_info_pointer(args.catalog_path)
             partition_info.write_to_file(partition_info_file)
-            step_progress.update(1)

-            margin_catalog_info = args.to_table_properties(int(total_rows))
+            margin_catalog_info = args.to_table_properties(
+                int(total_rows),
+                partition_info.get_highest_order(),
+                partition_info.calculate_fractional_coverage(),
+            )
             margin_catalog_info.to_properties_file(args.catalog_path)
-            ## TODO - optionally write out arguments file
             step_progress.update(1)
             file_io.remove_directory(args.tmp_path, ignore_errors=True)
             step_progress.update(1)
diff --git a/src/hats_import/margin_cache/margin_cache_arguments.py b/src/hats_import/margin_cache/margin_cache_arguments.py
index 41e1b1b8..703933a8 100644
--- a/src/hats_import/margin_cache/margin_cache_arguments.py
+++ b/src/hats_import/margin_cache/margin_cache_arguments.py
@@ -75,7 +75,9 @@ def _check_arguments(self):
         if margin_pixel_mindist * 60.0 < self.margin_threshold:
             raise ValueError("margin pixels must be larger than margin_threshold")

-    def to_table_properties(self, total_rows) -> TableProperties:
+    def to_table_properties(
+        self, total_rows: int, highest_order: int, moc_sky_fraction: float
+    ) -> TableProperties:
         """Catalog-type-specific dataset info."""
         info = {
             "catalog_name": self.output_artifact_name,
@@ -85,13 +87,7 @@ def to_table_properties(self, total_rows) -> TableProperties:
             "dec_column": self.catalog.catalog_info.dec_column,
             "primary_catalog": str(self.input_catalog_path),
             "margin_threshold": self.margin_threshold,
-        }
+            "hats_order": highest_order,
+            "moc_sky_fraction": f"{moc_sky_fraction:0.5f}",
+        } | self.extra_property_dict()
         return TableProperties(**info)
-
-    def additional_runtime_provenance_info(self) -> dict:
-        return {
-            "input_catalog_path": self.input_catalog_path,
-            "margin_threshold": self.margin_threshold,
-            "margin_order": self.margin_order,
-            "debug_filter_pixel_list": self.debug_filter_pixel_list,
-        }
diff --git a/src/hats_import/margin_cache/margin_cache_map_reduce.py b/src/hats_import/margin_cache/margin_cache_map_reduce.py
index bda17b74..f6ac74a7 100644
--- a/src/hats_import/margin_cache/margin_cache_map_reduce.py
+++ b/src/hats_import/margin_cache/margin_cache_map_reduce.py
@@ -4,7 +4,6 @@
 import pyarrow as pa
 import pyarrow.dataset as ds
 from hats import pixel_math
-from hats.catalog.partition_info import PartitionInfo
 from hats.io import file_io, paths
 from hats.pixel_math.healpix_pixel import HealpixPixel

@@ -112,22 +111,22 @@ def _to_pixel_shard(
     shard_path = paths.pixel_catalog_file(partition_dir, source_pixel)

     rename_columns = {
-        PartitionInfo.METADATA_ORDER_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_ORDER_COLUMN_NAME}",
-        PartitionInfo.METADATA_DIR_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_DIR_COLUMN_NAME}",
-        PartitionInfo.METADATA_PIXEL_COLUMN_NAME: f"margin_{PartitionInfo.METADATA_PIXEL_COLUMN_NAME}",
+        paths.PARTITION_ORDER: paths.MARGIN_ORDER,
+        paths.PARTITION_DIR: paths.MARGIN_DIR,
+        paths.PARTITION_PIXEL: paths.MARGIN_PIXEL,
     }

     margin_data = margin_data.rename(columns=rename_columns)

-    margin_data[PartitionInfo.METADATA_ORDER_COLUMN_NAME] = pixel.order
-    margin_data[PartitionInfo.METADATA_DIR_COLUMN_NAME] = pixel.dir
-    margin_data[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] = pixel.pixel
+    margin_data[paths.PARTITION_ORDER] = pixel.order
+    margin_data[paths.PARTITION_DIR] = pixel.dir
+    margin_data[paths.PARTITION_PIXEL] = pixel.pixel

     margin_data = margin_data.astype(
         {
-            PartitionInfo.METADATA_ORDER_COLUMN_NAME: np.uint8,
-            PartitionInfo.METADATA_DIR_COLUMN_NAME: np.uint64,
-            PartitionInfo.METADATA_PIXEL_COLUMN_NAME: np.uint64,
+            paths.PARTITION_ORDER: np.uint8,
+            paths.PARTITION_DIR: np.uint64,
+            paths.PARTITION_PIXEL: np.uint64,
         }
     )
     margin_data = margin_data.sort_index()
@@ -152,9 +151,9 @@ def reduce_margin_shards(
         schema = file_io.read_parquet_metadata(original_catalog_metadata).schema.to_arrow_schema()

         schema = (
-            schema.append(pa.field("margin_Norder", pa.uint8()))
-            .append(pa.field("margin_Dir", pa.uint64()))
-            .append(pa.field("margin_Npix", pa.uint64()))
+            schema.append(pa.field(paths.MARGIN_ORDER, pa.uint8()))
+            .append(pa.field(paths.MARGIN_DIR, pa.uint64()))
+            .append(pa.field(paths.MARGIN_PIXEL, pa.uint64()))
         )
         data = ds.dataset(shard_dir, format="parquet", schema=schema)
         full_df = data.to_table().to_pandas()
diff --git a/src/hats_import/runtime_arguments.py b/src/hats_import/runtime_arguments.py
index fcab3277..d619b50b 100644
--- a/src/hats_import/runtime_arguments.py
+++ b/src/hats_import/runtime_arguments.py
@@ -4,12 +4,14 @@
 import re
 from dataclasses import dataclass
-from importlib.metadata import version
+from datetime import datetime, timezone
 from pathlib import Path

 from hats.io import file_io
 from upath import UPath

+import hats_import
+
 # pylint: disable=too-many-instance-attributes

@@ -22,6 +24,11 @@ class RuntimeArguments:
     """base path where new catalog should be output"""
     output_artifact_name: str = ""
     """short, convenient name for the catalog"""
+    addl_hats_properties: dict = None
+    """Any additional keyword arguments you would like to provide when writing
+    the `properties` file for the final HATS table. e.g.
+    {"hats_cols_default":"id, mjd", "hats_cols_survey_id":"unique_id",
+    "creator_did": "ivo://CDS/P/2MASS/J"}"""

     ## Execution
     tmp_dir: str | Path | UPath | None = None
@@ -103,36 +110,19 @@ def _check_arguments(self):
         else:
             self.resume_tmp = self.tmp_path

-    def provenance_info(self) -> dict:
-        """Fill all known information in a dictionary for provenance tracking.
-
-        Returns:
-            dictionary with all argument_name -> argument_value as key -> value pairs.
-        """
-        runtime_args = {
-            "catalog_name": self.output_artifact_name,
-            "output_path": self.output_path,
-            "output_artifact_name": self.output_artifact_name,
-            "tmp_dir": self.tmp_dir,
-            "dask_tmp": self.dask_tmp,
-            "dask_n_workers": self.dask_n_workers,
-            "dask_threads_per_worker": self.dask_threads_per_worker,
-            "catalog_path": self.catalog_path,
-            "tmp_path": self.tmp_path,
-        }
-
-        runtime_args.update(self.additional_runtime_provenance_info())
-        provenance_info = {
-            "tool_name": "hats_import",
-            "version": version("hats-import"),
-            "runtime_args": runtime_args,
-        }
-
-        return provenance_info
-
-    def additional_runtime_provenance_info(self):
-        """Any additional runtime args to be included in provenance info from subclasses"""
-        return {}
+    def extra_property_dict(self):
+        properties = {}
+        properties["hats_builder"] = f"hats-import v{hats_import.__version__}"
+
+        now = datetime.now(tz=timezone.utc)
+        properties["hats_creation_date"] = now.strftime("%Y-%m-%dT%H:%M%Z")
+        properties["hats_estsize"] = int(_estimate_dir_size(self.catalog_path) / 1024)
+        properties["hats_release_date"] = "2024-09-18"
+        properties["hats_version"] = "v0.1"
+
+        if self.addl_hats_properties:
+            properties = properties | self.addl_hats_properties
+        return properties


 def find_input_paths(input_path="", file_matcher="", input_file_list=None):
@@ -166,3 +156,13 @@ def find_input_paths(input_path="", file_matcher="", input_file_list=None):
     if len(input_paths) == 0:
         raise FileNotFoundError("No input files found")
     return input_paths
+
+
+def _estimate_dir_size(dir):
+    total_size = 0
+    for item in dir.iterdir():
+        if item.is_dir():
+            total_size += _estimate_dir_size(item)
+        else:
+            total_size += item.stat().st_size
+    return total_size
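A minimal usage sketch, not part of the patch, for the `extra_property_dict` hook added above. The output path is hypothetical and should point at an existing catalog directory, since `hats_estsize` walks it via `_estimate_dir_size`.

    from hats_import.runtime_arguments import RuntimeArguments

    args = RuntimeArguments(
        output_artifact_name="small_sky_source_catalog",
        output_path="/data/catalogs",  # hypothetical; parent of an existing catalog directory
        addl_hats_properties={"obs_regime": "Optical"},
    )
    # Pipeline-supplied keys come first (hats_builder, hats_creation_date, hats_estsize,
    # hats_release_date, hats_version); addl_hats_properties is merged on top.
    print(args.extra_property_dict())
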
diff --git a/src/hats_import/soap/arguments.py b/src/hats_import/soap/arguments.py
index 560ee60a..841ada97 100644
--- a/src/hats_import/soap/arguments.py
+++ b/src/hats_import/soap/arguments.py
@@ -65,7 +65,9 @@ def _check_arguments(self):
         if self.compute_partition_size < 100_000:
             raise ValueError("compute_partition_size must be at least 100_000")

-    def to_table_properties(self, total_rows: int) -> TableProperties:
+    def to_table_properties(
+        self, total_rows: int, highest_order: int, moc_sky_fraction: float
+    ) -> TableProperties:
         """Catalog-type-specific dataset info."""
         info = {
             "catalog_name": self.output_artifact_name,
@@ -78,16 +80,7 @@ def to_table_properties(self, total_rows: int) -> TableProperties:
             "join_column_association": "source_id",
             "join_catalog": str(self.source_catalog_dir),
             "contains_leaf_files": self.write_leaf_files,
-        }
+            "hats_order": highest_order,
+            "moc_sky_fraction": f"{moc_sky_fraction:0.5f}",
+        } | self.extra_property_dict()
         return TableProperties(**info)
-
-    def additional_runtime_provenance_info(self) -> dict:
-        return {
-            "object_catalog_dir": self.object_catalog_dir,
-            "object_id_column": self.object_id_column,
-            "source_catalog_dir": self.source_catalog_dir,
-            "source_object_id_column": self.source_object_id_column,
-            "source_id_column": self.source_id_column,
-            "compute_partition_size": self.compute_partition_size,
-            "write_leaf_files": self.write_leaf_files,
-        }
diff --git a/src/hats_import/soap/run_soap.py b/src/hats_import/soap/run_soap.py
index 462f6fde..8bd59459 100644
--- a/src/hats_import/soap/run_soap.py
+++ b/src/hats_import/soap/run_soap.py
@@ -3,8 +3,8 @@
 The actual logic of the map reduce is in the `map_reduce.py` file.
 """

-from hats.catalog.association_catalog.partition_join_info import PartitionJoinInfo
-from hats.io import parquet_metadata, paths, write_metadata
+from hats.catalog import PartitionInfo, PartitionJoinInfo
+from hats.io import parquet_metadata, paths

 from hats_import.soap.arguments import SoapArguments
 from hats_import.soap.map_reduce import combine_partial_results, count_joins, reduce_joins
@@ -57,7 +57,10 @@ def run(args, client):
         else:
             total_rows = combine_partial_results(args.tmp_path, args.catalog_path)
         step_progress.update(1)
-        catalog_info = args.to_table_properties(total_rows)
+        partition_info = PartitionInfo.read_from_dir(args.catalog_path)
+        catalog_info = args.to_table_properties(
+            total_rows, partition_info.get_highest_order(), partition_info.calculate_fractional_coverage()
+        )
         catalog_info.to_properties_file(args.catalog_path)
         step_progress.update(1)
         ## TODO - optionally write out arguments file
diff --git a/src/hats_import/verification/arguments.py b/src/hats_import/verification/arguments.py
index 207793db..d17a30ed 100644
--- a/src/hats_import/verification/arguments.py
+++ b/src/hats_import/verification/arguments.py
@@ -42,10 +42,3 @@ def _check_arguments(self):
             self.input_catalog = Catalog.read_hats(catalog_path=self.input_catalog_path)
         if not self.input_catalog_path:
             self.input_catalog_path = self.input_catalog.catalog_path
-
-    def additional_runtime_provenance_info(self) -> dict:
-        return {
-            "pipeline": "verification pipeline",
-            "input_catalog_path": self.input_catalog_path,
-            "field_distribution_cols": self.field_distribution_cols,
-        }
diff --git a/tests/hats_import/catalog/test_argument_validation.py b/tests/hats_import/catalog/test_argument_validation.py
index e0ddff1e..132c43ac 100644
--- a/tests/hats_import/catalog/test_argument_validation.py
+++ b/tests/hats_import/catalog/test_argument_validation.py
@@ -1,10 +1,8 @@
 """Tests of argument validation"""

 import pytest
-from hats.io import write_metadata

 from hats_import.catalog.arguments import ImportArguments, check_healpix_order_range
-from hats_import.catalog.file_readers import CsvReader

 # pylint: disable=protected-access
@@ -214,58 +212,13 @@ def test_to_table_properties(blank_data_dir, tmp_path):
         output_path=tmp_path,
         tmp_dir=tmp_path,
         progress_bar=False,
+        addl_hats_properties={"hats_cols_default": "id, mjd", "obs_regime": "Optical"},
     )
-    catalog_info = args.to_table_properties(total_rows=10)
+    catalog_info = args.to_table_properties(total_rows=10, highest_order=4, moc_sky_fraction=22 / 7)
     assert catalog_info.catalog_name == "catalog"
     assert catalog_info.total_rows == 10
-
-
-@pytest.mark.skip("provenance")
-def test_provenance_info(blank_data_dir, tmp_path):
-    """Verify that provenance info includes catalog-specific fields."""
-    args = ImportArguments(
-        output_artifact_name="catalog",
-        input_path=blank_data_dir,
-        file_reader="csv",
-        output_path=tmp_path,
-        tmp_dir=tmp_path,
-        progress_bar=False,
-    )
-
-    runtime_args = args.provenance_info()["runtime_args"]
-    assert "epoch" in runtime_args
-
-
-@pytest.mark.skip("provenance")
-def test_write_provenance_info(formats_dir, tmp_path):
-    """Verify that provenance info can be written to JSON file."""
-    input_file = formats_dir / "gaia_minimum.csv"
-    schema_file = formats_dir / "gaia_minimum_schema.parquet"
-
-    args = ImportArguments(
-        output_artifact_name="gaia_minimum",
-        input_file_list=[input_file],
-        file_reader=CsvReader(
-            comment="#",
-            header=None,
-            schema_file=schema_file,
-        ),
-        ra_column="ra",
-        dec_column="dec",
-        sort_columns="solution_id",
-        use_schema_file=schema_file,
-        output_path=tmp_path,
-        dask_tmp=tmp_path,
-        highest_healpix_order=2,
-        pixel_threshold=3_000,
-        progress_bar=False,
-    )
-
-    write_metadata.write_provenance_info(
-        catalog_base_dir=args.catalog_path,
-        dataset_info=args.to_catalog_info(0),
-        tool_args=args.provenance_info(),
-    )
+    assert catalog_info.default_columns == ["id", "mjd"]
+    assert catalog_info.__pydantic_extra__["obs_regime"] == "Optical"


 def test_check_healpix_order_range():
diff --git a/tests/hats_import/catalog/test_file_readers.py b/tests/hats_import/catalog/test_file_readers.py
index f88612bb..de95e3c8 100644
--- a/tests/hats_import/catalog/test_file_readers.py
+++ b/tests/hats_import/catalog/test_file_readers.py
@@ -1,6 +1,5 @@
 """Test dataframe-generating file readers"""

-import hats.io.write_metadata as io
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -230,34 +229,6 @@ def test_csv_reader_pipe_delimited(formats_pipe_csv, tmp_path):
     assert np.all(column_types == expected_column_types)


-@pytest.mark.skip("provenance")
-def test_csv_reader_provenance_info(tmp_path, basic_catalog_info):
-    """Test that we get some provenance info and it is parseable into JSON."""
-    reader = CsvReader(
-        header=None,
-        sep="|",
-        column_names=["letters", "ints", "empty", "numeric"],
-        type_map={
-            "letters": object,
-            "ints": int,
-            "empty": "Int64",
-            "numeric": int,
-        },
-        storage_options={"user_name": "user_pii", "user_key": "SECRETS!"},
-    )
-    provenance_info = reader.provenance_info()
-    catalog_base_dir = tmp_path / "test_catalog"
-    catalog_base_dir.mkdir(parents=True)
-    io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
-
-    with open(catalog_base_dir / "provenance_info.json", "r", encoding="utf-8") as file:
-        data = file.read()
-        assert "test_catalog" in data
-        assert "REDACTED" in data
-        assert "user_pii" not in data
-        assert "SECRETS" not in data
-
-
 def test_indexed_csv_reader(indexed_files_dir):
     # Chunksize covers all the inputs.
     total_chunks = 0
@@ -331,16 +302,6 @@ def test_indexed_parquet_reader(indexed_files_dir):
     assert total_chunks == 29


-@pytest.mark.skip("provenance")
-def test_parquet_reader_provenance_info(tmp_path, basic_catalog_info):
-    """Test that we get some provenance info and it is parseable into JSON."""
-    reader = ParquetReader(chunksize=1)
-    provenance_info = reader.provenance_info()
-    catalog_base_dir = tmp_path / "test_catalog"
-    catalog_base_dir.mkdir(parents=True)
-    io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
-
-
 def test_parquet_reader_columns(parquet_shards_shard_44_0):
     """Verify we can read a subset of columns."""
     column_subset = ["id", "dec"]
@@ -389,13 +350,3 @@ def test_read_fits_columns(formats_fits):
         FitsReader(skip_column_names=["ra_error", "dec_error"]).read(formats_fits, read_columns=["ra", "dec"])
     )
     assert list(frame.columns) == ["ra", "dec"]
-
-
-@pytest.mark.skip("provenance")
-def test_fits_reader_provenance_info(tmp_path, basic_catalog_info):
-    """Test that we get some provenance info and it is parseable into JSON."""
-    reader = FitsReader()
-    provenance_info = reader.provenance_info()
-    catalog_base_dir = tmp_path / "test_catalog"
-    catalog_base_dir.mkdir(parents=True)
-    io.write_provenance_info(catalog_base_dir, basic_catalog_info, provenance_info)
diff --git a/tests/hats_import/index/test_index_argument.py b/tests/hats_import/index/test_index_argument.py
index 10d334f0..c8d26c59 100644
--- a/tests/hats_import/index/test_index_argument.py
+++ b/tests/hats_import/index/test_index_argument.py
@@ -167,18 +167,3 @@ def test_to_table_properties(small_sky_object_catalog, tmp_path):
     catalog_info = args.to_table_properties(total_rows=10)
     assert catalog_info.catalog_name == args.output_artifact_name
     assert catalog_info.total_rows == 10
-
-
-def test_provenance_info(small_sky_object_catalog, tmp_path):
-    """Verify that provenance info includes index-specific fields."""
-    args = IndexArguments(
-        input_catalog_path=small_sky_object_catalog,
-        indexing_column="id",
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_index",
-        include_healpix_29=True,
-        include_order_pixel=True,
-    )
-
-    runtime_args = args.provenance_info()["runtime_args"]
-    assert "input_catalog_path" in runtime_args
diff --git a/tests/hats_import/margin_cache/test_arguments_margin_cache.py b/tests/hats_import/margin_cache/test_arguments_margin_cache.py
index ee7cbb4f..75d20589 100644
--- a/tests/hats_import/margin_cache/test_arguments_margin_cache.py
+++ b/tests/hats_import/margin_cache/test_arguments_margin_cache.py
@@ -1,7 +1,6 @@
 """Tests of margin cache generation arguments"""

 import pytest
-from hats.io import write_metadata
 from hats.pixel_math.healpix_pixel import HealpixPixel

 from hats_import.margin_cache.margin_cache_arguments import MarginCacheArguments
@@ -122,28 +121,8 @@ def test_to_table_properties(small_sky_source_catalog, tmp_path):
         output_artifact_name="catalog_cache",
         margin_order=4,
     )
-    catalog_info = args.to_table_properties(total_rows=10)
+    catalog_info = args.to_table_properties(total_rows=10, highest_order=4, moc_sky_fraction=22 / 7)
     assert catalog_info.catalog_name == args.output_artifact_name
     assert catalog_info.total_rows == 10
     assert catalog_info.ra_column == "source_ra"
     assert catalog_info.dec_column == "source_dec"
-
-
-@pytest.mark.skip("provenance")
-def test_provenance_info(small_sky_source_catalog, tmp_path):
-    """Verify that provenance info includes margin-cache-specific fields."""
-    args = MarginCacheArguments(
-        margin_threshold=5.0,
-        input_catalog_path=small_sky_source_catalog,
-        output_path=tmp_path,
-        output_artifact_name="catalog_cache",
-        margin_order=4,
-        debug_filter_pixel_list=[HealpixPixel(1, 44)],
-    )
-
-    runtime_args = args.provenance_info()["runtime_args"]
-    assert "margin_threshold" in runtime_args
-
-    write_metadata.write_provenance_info(
-        catalog_base_dir=args.catalog_path, dataset_info=args.to_catalog_info(20_000), tool_args=runtime_args
-    )
diff --git a/tests/hats_import/margin_cache/test_margin_cache.py b/tests/hats_import/margin_cache/test_margin_cache.py
index 9543ac46..1e4768b0 100644
--- a/tests/hats_import/margin_cache/test_margin_cache.py
+++ b/tests/hats_import/margin_cache/test_margin_cache.py
@@ -4,7 +4,6 @@
 import numpy.testing as npt
 import pandas as pd
 import pytest
-from hats.catalog import PartitionInfo
 from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset
 from hats.io import paths
 from hats.pixel_math.healpix_pixel import HealpixPixel
@@ -38,13 +37,13 @@ def test_margin_cache_gen(small_sky_source_catalog, tmp_path, dask_client):

     assert len(data) == 13

-    assert all(data[PartitionInfo.METADATA_ORDER_COLUMN_NAME] == norder)
-    assert all(data[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] == npix)
-    assert all(data[PartitionInfo.METADATA_DIR_COLUMN_NAME] == int(npix / 10000) * 10000)
+    assert all(data[paths.PARTITION_ORDER] == norder)
+    assert all(data[paths.PARTITION_PIXEL] == npix)
+    assert all(data[paths.PARTITION_DIR] == int(npix / 10_000) * 10_000)

-    assert data.dtypes[PartitionInfo.METADATA_ORDER_COLUMN_NAME] == np.uint8
-    assert data.dtypes[PartitionInfo.METADATA_DIR_COLUMN_NAME] == np.uint64
-    assert data.dtypes[PartitionInfo.METADATA_PIXEL_COLUMN_NAME] == np.uint64
+    assert data.dtypes[paths.PARTITION_ORDER] == np.uint8
+    assert data.dtypes[paths.PARTITION_PIXEL] == np.uint64
+    assert data.dtypes[paths.PARTITION_DIR] == np.uint64

     npt.assert_array_equal(
         data.columns,
diff --git a/tests/hats_import/test_runtime_arguments.py b/tests/hats_import/test_runtime_arguments.py
index 523ca933..2bb16874 100644
--- a/tests/hats_import/test_runtime_arguments.py
+++ b/tests/hats_import/test_runtime_arguments.py
@@ -123,15 +123,48 @@ def test_dask_args(tmp_path):
         )


-def test_provenance_info(tmp_path):
-    """Verify that provenance info ONLY includes general runtime fields."""
+def test_extra_property_dict(test_data_dir):
     args = RuntimeArguments(
-        output_artifact_name="catalog",
-        output_path=tmp_path,
-        tmp_dir=tmp_path,
-        dask_tmp=tmp_path,
-        progress_bar=False,
+        output_artifact_name="small_sky_source_catalog",
+        output_path=test_data_dir,
+    )
+
+    properties = args.extra_property_dict()
+    assert list(properties.keys()) == [
+        "hats_builder",
+        "hats_creation_date",
+        "hats_estsize",
+        "hats_release_date",
+        "hats_version",
+    ]
+
+    # Most values are dynamic, but these are some safe assumptions.
+    assert properties["hats_builder"].startswith("hats")
+    assert properties["hats_creation_date"].startswith("20")
+    assert properties["hats_estsize"] > 1_000
+    assert properties["hats_release_date"].startswith("20")
+    assert properties["hats_version"].startswith("v")
+
+    args = RuntimeArguments(
+        output_artifact_name="small_sky_source_catalog",
+        output_path=test_data_dir,
+        addl_hats_properties={"foo": "bar"},
     )
-    runtime_args = args.provenance_info()["runtime_args"]
-    assert len(runtime_args) == 9
+
+    properties = args.extra_property_dict()
+    assert list(properties.keys()) == [
+        "hats_builder",
+        "hats_creation_date",
+        "hats_estsize",
+        "hats_release_date",
+        "hats_version",
+        "foo",
+    ]
+
+    # Most values are dynamic, but these are some safe assumptions.
+    assert properties["hats_builder"].startswith("hats")
+    assert properties["hats_creation_date"].startswith("20")
+    assert properties["hats_estsize"] > 1_000
+    assert properties["hats_release_date"].startswith("20")
+    assert properties["hats_version"].startswith("v")
+    assert properties["foo"] == "bar"
diff --git a/tests/hats_import/verification/test_verification_arguments.py b/tests/hats_import/verification/test_verification_arguments.py
index 2b41e853..dd2203ba 100644
--- a/tests/hats_import/verification/test_verification_arguments.py
+++ b/tests/hats_import/verification/test_verification_arguments.py
@@ -53,8 +53,12 @@ def test_good_paths(tmp_path, small_sky_object_catalog):
     assert str(args.tmp_path).startswith(tmp_path_str)


+@pytest.mark.timeout(5)
 def test_catalog_object(tmp_path, small_sky_object_catalog):
-    """Required arguments are provided, and paths are found."""
+    """Required arguments are provided, and paths are found.
+
+    NB: This is currently the last test in alpha-order, and may require additional
+    time to teardown fixtures."""
     small_sky_catalog_object = Catalog.read_hats(catalog_path=small_sky_object_catalog)
     tmp_path_str = str(tmp_path)
     args = VerificationArguments(
@@ -65,18 +69,3 @@ def test_catalog_object(tmp_path, small_sky_object_catalog):
     assert args.input_catalog_path == small_sky_object_catalog
     assert str(args.output_path) == tmp_path_str
     assert str(args.tmp_path).startswith(tmp_path_str)
-
-
-@pytest.mark.timeout(5)
-def test_provenance_info(small_sky_object_catalog, tmp_path):
-    """Verify that provenance info includes verification-specific fields.
-    NB: This is currently the last test in alpha-order, and may require additional
-    time to teardown fixtures."""
-    args = VerificationArguments(
-        input_catalog_path=small_sky_object_catalog,
-        output_path=tmp_path,
-        output_artifact_name="small_sky_object_verification_report",
-    )
-
-    runtime_args = args.provenance_info()["runtime_args"]
-    assert "input_catalog_path" in runtime_args
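A hedged end-to-end sketch of how `addl_hats_properties` reaches the final `properties` file. Only `ImportArguments`, `addl_hats_properties`, and `run(args, client)` come from this diff; the paths and the Dask client setup are assumptions.

    from dask.distributed import Client

    import hats_import.catalog.run_import as runner
    from hats_import.catalog.arguments import ImportArguments

    args = ImportArguments(
        output_artifact_name="catalog",
        input_path="/data/input_csvs",   # hypothetical input location
        file_reader="csv",
        output_path="/data/catalogs",    # hypothetical output location
        addl_hats_properties={"hats_cols_default": "id, mjd", "obs_regime": "Optical"},
    )
    with Client(n_workers=4) as client:
        # Writes hats_order, moc_sky_fraction, and the extra keys above into `properties`.
        runner.run(args, client)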