huggingface · lhoestq · Jan 26, 2024 · Dec 18, 2023 · Dec 18, 2023 · Dec 21, 2023
diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
@@ -50,6 +50,8 @@ def _hash_python_lines(lines: List[str]) -> str:
     ".json": ("json", {}),
     ".jsonl": ("json", {}),
     ".parquet": ("parquet", {}),
+    ".geoparquet": ("parquet", {}),
+    ".gpq": ("parquet", {}),
     ".arrow": ("arrow", {}),
     ".txt": ("text", {}),
     ".tar": ("webdataset", {}),

diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py
@@ -7,6 +7,7 @@
 import textwrap
 import zipfile
 
+import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
@@ -332,6 +333,14 @@ def parquet_path(tmp_path_factory):
     return path
 
 
+@pytest.fixture(scope="session")
+def geoparquet_path(tmp_path_factory):
+    url = "https://github.com/opengeospatial/geoparquet/raw/v1.0.0/examples/example.parquet"
+    target = str(tmp_path_factory.mktemp("data") / "dataset.geoparquet")
+    pd.read_parquet(path=url).to_parquet(path=target)  # re-save the remote example locally (needs network)
+    return target
+
+
 @pytest.fixture(scope="session")
 def json_list_of_dicts_path(tmp_path_factory):
     path = str(tmp_path_factory.mktemp("data") / "dataset.json")

diff --git a/tests/io/test_parquet.py b/tests/io/test_parquet.py
@@ -69,6 +69,25 @@ def test_dataset_from_parquet_path_type(path_type, parquet_path, tmp_path):
     _check_parquet_dataset(dataset, expected_features)
 
 
+def test_parquet_read_geoparquet(geoparquet_path, tmp_path):
+    """A GeoParquet file loads through the regular Parquet reader; geometry stays binary."""
+    dataset = ParquetDatasetReader(path_or_paths=geoparquet_path, cache_dir=tmp_path / "cache").read()
+    expected_features = {
+        "pop_est": "float64",
+        "continent": "string",
+        "name": "string",
+        "iso_a3": "string",
+        "gdp_md_est": "int64",
+        "geometry": "binary",
+    }
+    assert isinstance(dataset, Dataset)
+    assert dataset.num_rows == 5
+    assert dataset.num_columns == 6
+    assert dataset.column_names == ["pop_est", "continent", "name", "iso_a3", "gdp_md_est", "geometry"]
+    for feature, expected_dtype in expected_features.items():
+        assert dataset.features[feature].dtype == expected_dtype
+
+
 def _check_parquet_datasetdict(dataset_dict, expected_features, splits=("train",)):
     assert isinstance(dataset_dict, (DatasetDict, IterableDatasetDict))
     for split in splits:

diff --git a/tests/test_load.py b/tests/test_load.py
@@ -318,6 +318,8 @@ def metric_loading_script_dir(tmp_path):
         (["train.json"], "json", {}),
         (["train.jsonl"], "json", {}),
         (["train.parquet"], "parquet", {}),
+        (["train.geoparquet"], "parquet", {}),
+        (["train.gpq"], "parquet", {}),
         (["train.arrow"], "arrow", {}),
         (["train.txt"], "text", {}),
         (["uppercase.TXT"], "text", {}),