From f35ac183863b23f910b798104c75dfe3e1a6850d Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:45:34 +1300 Subject: [PATCH 1/4] Read GeoParquet files using parquet reader Let GeoParquet files with the file extension `*.geoparquet` or `*.gpq` be readable by the default parquet reader. --- src/datasets/packaged_modules/__init__.py | 1 + tests/test_load.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index a9fe30d7f5d..dec91e098ed 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -57,6 +57,7 @@ def _hash_python_lines(lines: List[str]) -> str: _EXTENSION_TO_MODULE.update({ext.upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext: ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({".geoparquet": ("parquet", {}), ".gpq": {"parquet", {}}}) _MODULE_SUPPORTS_METADATA = {"imagefolder", "audiofolder"} # Used to filter data files based on extensions given a module name diff --git a/tests/test_load.py b/tests/test_load.py index 66321ef8c47..807a6596b18 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -318,6 +318,8 @@ def metric_loading_script_dir(tmp_path): (["train.json"], "json", {}), (["train.jsonl"], "json", {}), (["train.parquet"], "parquet", {}), + (["train.geoparquet"], "parquet", {}), + (["train.gpq"], "parquet", {}), (["train.arrow"], "arrow", {}), (["train.txt"], "text", {}), (["uppercase.TXT"], "text", {}), From 88c05c924081383a6da1b2f2f1c5df5a15f41fa8 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Mon, 18 Dec 2023 20:30:17 +1300 Subject: [PATCH 2/4] Fix typo curly and round brackets --- 
src/datasets/packaged_modules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index dec91e098ed..a3c3d258d2e 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -57,7 +57,7 @@ def _hash_python_lines(lines: List[str]) -> str: _EXTENSION_TO_MODULE.update({ext.upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext: ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) -_EXTENSION_TO_MODULE.update({".geoparquet": ("parquet", {}), ".gpq": {"parquet", {}}}) +_EXTENSION_TO_MODULE.update({".geoparquet": ("parquet", {}), ".gpq": ("parquet", {})}) _MODULE_SUPPORTS_METADATA = {"imagefolder", "audiofolder"} # Used to filter data files based on extensions given a module name From 0b76cdadb413d3d03dc21e19eeb55cb9abebcbb8 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Fri, 22 Dec 2023 12:56:29 +1300 Subject: [PATCH 3/4] Add test to read geoparquet file Getting a sample GeoParquet file from https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet, saving it with a .geoparquet extension, and trying to load it back, checking that column dtypes are correct. Also decided to put .geoparquet and .gpq in the _EXTENSION_TO_MODULE dictionary directly. 
--- src/datasets/packaged_modules/__init__.py | 3 ++- tests/fixtures/files.py | 9 +++++++++ tests/io/test_parquet.py | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 1279df820cf..9efa5c3eb4a 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -50,6 +50,8 @@ def _hash_python_lines(lines: List[str]) -> str: ".json": ("json", {}), ".jsonl": ("json", {}), ".parquet": ("parquet", {}), + ".geoparquet": ("parquet", {}), + ".gpq": ("parquet", {}), ".arrow": ("arrow", {}), ".txt": ("text", {}), ".tar": ("webdataset", {}), @@ -58,7 +60,6 @@ def _hash_python_lines(lines: List[str]) -> str: _EXTENSION_TO_MODULE.update({ext.upper(): ("imagefolder", {}) for ext in imagefolder.ImageFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext: ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("audiofolder", {}) for ext in audiofolder.AudioFolder.EXTENSIONS}) -_EXTENSION_TO_MODULE.update({".geoparquet": ("parquet", {}), ".gpq": ("parquet", {})}) _MODULE_SUPPORTS_METADATA = {"imagefolder", "audiofolder"} # Used to filter data files based on extensions given a module name diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py index 76bf830676d..d275031340e 100644 --- a/tests/fixtures/files.py +++ b/tests/fixtures/files.py @@ -7,6 +7,7 @@ import textwrap import zipfile +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest @@ -332,6 +333,14 @@ def parquet_path(tmp_path_factory): return path +@pytest.fixture(scope="session") +def geoparquet_path(tmp_path_factory): + df = pd.read_parquet(path="https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet") + path = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet") + df.to_parquet(path=path) + return path + + 
@pytest.fixture(scope="session") def json_list_of_dicts_path(tmp_path_factory): path = str(tmp_path_factory.mktemp("data") / "dataset.json") diff --git a/tests/io/test_parquet.py b/tests/io/test_parquet.py index bf17bc142ff..34478b6c47d 100644 --- a/tests/io/test_parquet.py +++ b/tests/io/test_parquet.py @@ -69,6 +69,24 @@ def test_dataset_from_parquet_path_type(path_type, parquet_path, tmp_path): _check_parquet_dataset(dataset, expected_features) +def test_parquet_read_geoparquet(geoparquet_path, tmp_path): + cache_dir = tmp_path / "cache" + dataset = ParquetDatasetReader(path_or_paths=geoparquet_path, cache_dir=cache_dir).read() + + expected_features = { + "pop_est": "float64", + "continent": "string", + "name": "string", + "gdp_md_est": "int64", + "geometry": "binary", + } + assert isinstance(dataset, Dataset) + assert dataset.num_rows == 5 + assert dataset.num_columns == 6 + assert dataset.column_names == ["pop_est", "continent", "name", "iso_a3", "gdp_md_est", "geometry"] + for feature, expected_dtype in expected_features.items(): + assert dataset.features[feature].dtype == expected_dtype + def _check_parquet_datasetdict(dataset_dict, expected_features, splits=("train",)): assert isinstance(dataset_dict, (DatasetDict, IterableDatasetDict)) for split in splits: From 7088f585557807a63673cdc58900d7ce56146cf7 Mon Sep 17 00:00:00 2001 From: Wei Ji <23487320+weiji14@users.noreply.github.com> Date: Thu, 11 Jan 2024 14:08:53 +1300 Subject: [PATCH 4/4] Lint --- tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/io/test_parquet.py b/tests/io/test_parquet.py index 34478b6c47d..3e5ddee113e 100644 --- a/tests/io/test_parquet.py +++ b/tests/io/test_parquet.py @@ -87,6 +87,7 @@ def test_parquet_read_geoparquet(geoparquet_path, tmp_path): for feature, expected_dtype in expected_features.items(): assert dataset.features[feature].dtype == expected_dtype + def _check_parquet_datasetdict(dataset_dict, expected_features, splits=("train",)): assert 
isinstance(dataset_dict, (DatasetDict, IterableDatasetDict)) for split in splits: