Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repo2Docker outdated for CaltechDATA #1397

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions repo2docker/contentproviders/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ class Zenodo(DoiProvider):

def __init__(self):
super().__init__()
# We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata)
self.hosts = [
{
"hostname": [
Expand All @@ -30,31 +27,34 @@ def __init__(self):
"filename": "key",
"download": "links.content",
"type": "metadata.upload_type",
"is_caltech": False,
},
{
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
"https://zenodo.org/records/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"files": "links.files",
"filepath": "entries",
"filename": "key",
"download": "links.content",
"type": "metadata.upload_type",
"is_caltech": False,
},
{
"hostname": [
"https://data.caltech.edu/records/",
"http://data.caltech.edu/records/",
],
"api": "https://data.caltech.edu/api/record/",
"files": "",
"filepath": "metadata.electronic_location_and_access",
"filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral",
"api": "https://data.caltech.edu/api/records/",
"files": "links.files",
"filepath": "entries",
"filename": "key",
"download": "links.content",
"type": "metadata.upload_type",
"is_caltech": True,
},
]

Expand Down Expand Up @@ -93,6 +93,37 @@ def fetch(self, spec, output_dir, yield_output=False):
for file_ref in files:
yield from self.fetch_file(file_ref, host, output_dir, unzip=only_one_file)

def fetch_file(self, file_ref, host, output_dir, unzip=True):
    """Download one file of a record into ``output_dir``.

    This is a generator: it yields human-readable progress messages and
    performs the download (and optional extraction) as it is iterated.

    Args:
        file_ref: Metadata entry describing the file; the filename and the
            download URL are looked up in it with ``deep_get`` using the
            paths configured on ``host``.
        host: Host configuration dict. The keys ``"filename"``,
            ``"download"`` and ``"is_caltech"`` are read here.
        output_dir: Directory the file is written to (created if missing).
        unzip: When true and the filename ends in ``.zip``, the archive is
            unpacked into ``output_dir`` and the archive file is removed.

    Yields:
        str: Progress messages, each terminated by a newline.

    Raises:
        ValueError: If the filename taken from the record metadata would
            resolve to a location outside ``output_dir``.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    from urllib.parse import quote

    filename = deep_get(file_ref, host["filename"])
    if host["is_caltech"]:
        # Construct the direct download URL for Caltech Data. The filename
        # is percent-encoded so characters such as '#', '?' or spaces do not
        # change the meaning of the URL ('/' is kept as a path separator).
        download_url = (
            f"https://data.caltech.edu/records/{self.record_id}"
            f"/files/{quote(filename)}"
        )
    else:
        # Use the standard Zenodo download URL structure
        download_url = deep_get(file_ref, host["download"])

    output_path = path.join(output_dir, filename)

    # The filename comes from remote metadata: refuse anything that would
    # land outside output_dir (absolute paths, '..' components).
    root_dir = path.abspath(output_dir)
    resolved_path = path.abspath(output_path)
    if path.commonpath([root_dir, resolved_path]) != root_dir:
        raise ValueError(
            f"Refusing to write {filename!r} outside of {output_dir!r}"
        )

    # Create the output directory, including any sub-directory that is part
    # of the filename, so that open() below cannot fail on a missing parent.
    makedirs(path.dirname(resolved_path), exist_ok=True)

    yield f"Downloading {filename} to {output_path}\n"

    response = self.urlopen(download_url)
    # NOTE(review): ``response.content`` holds the complete payload in memory;
    # this is not a streaming download. The HTTP status is also not checked
    # here - confirm whether ``self.urlopen`` already raises on error responses.
    content = response.content

    # Write the content to file
    with open(output_path, "wb") as fp:
        fp.write(content)

    if unzip and filename.endswith(".zip"):
        yield f"Extracting {filename} to {output_dir}\n"
        shutil.unpack_archive(output_path, output_dir)
        os.remove(output_path)

@property
def content_id(self):
"""The Zenodo record ID as the content of a record is immutable"""
Expand Down
Loading