From 3ca48ed98f6eca8eb29b0e9486f12d17ed6b6922 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 21 Sep 2023 21:59:17 +0200 Subject: [PATCH] Add support for Pangaea datasets (#45) --- README.md | 2 +- datahugger/base.py | 5 +++- datahugger/config.py | 3 ++- datahugger/services.py | 40 +++++++++++++++++++++++++++++ docs/repositories.md | 2 +- scripts/estimate_repos_supported.py | 1 + tests/test_repositories.py | 3 +++ 7 files changed, 52 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bb188c4..a73297a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Datahugger is a tool to download scientific datasets, software, and code from a ## Supported repositories -Datahugger offers support for more than [376 generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!). +Datahugger offers support for more than [377 generic and specific (scientific) repositories](https://j535d165.github.io/datahugger/repositories) (and more to come!). 
[![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](https://github.com/J535D165/datahugger/raw/main/docs/images/logos.png)](https://j535d165.github.io/datahugger/repositories) diff --git a/datahugger/base.py b/datahugger/base.py index 0363ba2..cdb6024 100644 --- a/datahugger/base.py +++ b/datahugger/base.py @@ -303,7 +303,10 @@ def _get( ): if ( len(self.files) == 1 - and self.files[0]["link"].endswith(".zip") + and ( + self.files[0]["link"].endswith(".zip") + or self.files[0]["name"].endswith(".zip") + ) and self.unzip ): self._unpack_single_folder(self.files[0]["link"], output_folder) diff --git a/datahugger/config.py b/datahugger/config.py index b873ada..3176c87 100644 --- a/datahugger/config.py +++ b/datahugger/config.py @@ -9,6 +9,7 @@ from datahugger.services import HuggingFaceDataset from datahugger.services import MendeleyDataset from datahugger.services import OSFDataset +from datahugger.services import PangaeaDataset from datahugger.services import ZenodoDataset # fast lookup @@ -40,7 +41,7 @@ "get.iedadata.org": DataOneDataset, "usap-dc.org": DataOneDataset, "iys.hakai.org": DataOneDataset, - # "doi.pangaea.de": DataOneDataset, + "doi.pangaea.de": PangaeaDataset, "rvdata.us": DataOneDataset, "sead-published.ncsa.illinois.edu": DataOneDataset, # DataVerse repositories (extracted from re3data) diff --git a/datahugger/services.py b/datahugger/services.py index 20e87db..5f10f42 100644 --- a/datahugger/services.py +++ b/datahugger/services.py @@ -1,4 +1,5 @@ import io +import re import xml.etree.ElementTree as ET import zipfile from pathlib import Path @@ -142,6 +143,45 @@ def files(self): return self._files +class PangaeaDataset(DatasetDownloader): + """Downloader for PangaeaDataset repository.""" + + REGEXP_ID = r"doi\.pangaea\.de/(?P<record_id>.*)" + + # the base entry point of the REST API + API_URL = "https://doi.pangaea.de/" + + @property + def files(self): + # get the difference between 
collection and file + r = requests.get( + f"{self.API_URL}{self._params['record_id']}?format=metadata_jsonld" + ) + r.raise_for_status() + dists = r.json()["distribution"] + + if isinstance(dists, dict): + dists = [dists] + + files = [] + for d in dists: + if d["encodingFormat"] in ["text/tab-separated-values", "application/zip"]: + r_filename = requests.head(d["contentUrl"]) + content_d = r_filename.headers["content-disposition"] + + files.append( + { + "link": d["contentUrl"], + "name": re.findall("filename=(.+)", content_d)[0], + "size": None, + "hash": None, + "hash_type": None, + } + ) + + return files + + class DSpaceDataset(DatasetDownloader): """Downloader for DSpaceDataset repositories.""" diff --git a/docs/repositories.md b/docs/repositories.md index b072b28..98dc447 100644 --- a/docs/repositories.md +++ b/docs/repositories.md @@ -1,6 +1,6 @@ # Supported repositories -Datahugger offers support for more than 376 generic and specific (scientific) repositories (and more to come!). +Datahugger offers support for more than 377 generic and specific (scientific) repositories (and more to come!). ![Datahugger support Zenodo, Dataverse, DataOne, GitHub, FigShare, HuggingFace, Mendeley Data, Dryad, OSF, and many more](images/logos.png) diff --git a/scripts/estimate_repos_supported.py b/scripts/estimate_repos_supported.py index 5895602..01cba52 100644 --- a/scripts/estimate_repos_supported.py +++ b/scripts/estimate_repos_supported.py @@ -16,6 +16,7 @@ def count_repos(): "dryad": 1, "github": 1, "huggingface": 1, + "pangaea": 1, } print(counts) diff --git a/tests/test_repositories.py b/tests/test_repositories.py index 954aaab..aca2a6e 100644 --- a/tests/test_repositories.py +++ b/tests/test_repositories.py @@ -55,6 +55,9 @@ ), # huggingface # ("10.57967/hf/0034", "test.csv"), + # Pangaea + ("https://doi.org/10.1594/PANGAEA.954547", "Gubbio_age.tab"), + ("https://doi.pangaea.de/10.1594/PANGAEA.954543", "AA_age.tab"), ]