Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Add a repo provider for CKAN datasets #1833

Merged
merged 7 commits into from
Jul 6, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions binderhub/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from .ratelimit import RateLimiter
from .registry import DockerRegistry
from .repoproviders import (
CKANProvider,
DataverseProvider,
FigshareProvider,
GistRepoProvider,
Expand Down Expand Up @@ -586,6 +587,7 @@ def _default_build_namespace(self):
"figshare": FigshareProvider,
"hydroshare": HydroshareProvider,
"dataverse": DataverseProvider,
"ckan": CKANProvider,
},
config=True,
help="""
Expand Down
3 changes: 2 additions & 1 deletion binderhub/event-schemas/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
"Zenodo",
"Figshare",
"Hydroshare",
"Dataverse"
"Dataverse",
"CKAN"
],
"description": "Provider for the repository being launched"
},
Expand Down
1 change: 1 addition & 0 deletions binderhub/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"figshare": "Figshare",
"hydroshare": "Hydroshare",
"dataverse": "Dataverse",
"ckan": "CKAN",
}


Expand Down
67 changes: 67 additions & 0 deletions binderhub/repoproviders.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,73 @@ def get_build_slug(self):
return f"hydroshare-{self.record_id}"


class CKANProvider(RepoProvider):
"""Provide contents of a CKAN dataset
Users must provide a spec consisting of the CKAN dataset URL.
"""

name = Unicode("CKAN")

display_name = "CKAN dataset"

url_regex = r"/dataset/[a-z0-9_\\-]*$"

labels = {
"text": "CKAN dataset URL (https://demo.ckan.org/dataset/sample-dataset-1)",
"tag_text": "Git ref (branch, tag, or commit)",
"ref_prop_disabled": True,
"label_prop_disabled": True,
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.repo = urllib.parse.unquote(self.spec)

async def get_resolved_ref(self):
parsed_repo = urlparse(self.repo)
self.dataset_id = parsed_repo.path.rsplit("/", maxsplit=1)[1]

client = AsyncHTTPClient()

api = parsed_repo._replace(
path=re.sub(self.url_regex, "/api/3/action/", parsed_repo.path)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment about using regexes as in jupyterhub/repo2docker#1336 (comment).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I have made changes according to your suggestion.

).geturl()

package_show_url = f"{api}package_show?id={self.dataset_id}"

try:
r = await client.fetch(package_show_url, user_agent="BinderHub")
except HTTPError:
return None

def parse_date(json_body):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that this function is used only once, let's just inline that here. Otherwise the function gets redefined each time the parent function is called. Also adds another layer of indirection that's otherwise not needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I will fix this as soon as possible.

json_response = json.loads(json_body)
date = json_response["result"]["metadata_modified"]
parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()
# truncate the timestamp
return str(int(epoch))

self.record_id = f"{self.dataset_id}.v{parse_date(r.body)}"

return self.record_id

async def get_resolved_spec(self):
if not hasattr(self, "record_id"):
await self.get_resolved_ref()
return self.repo

def get_repo_url(self):
return self.repo

async def get_resolved_ref_url(self):
resolved_spec = await self.get_resolved_spec()
return resolved_spec

def get_build_slug(self):
return f"ckan-{self.dataset_id}"


class GitRepoProvider(RepoProvider):
"""Bare bones git repo provider.

Expand Down
3 changes: 2 additions & 1 deletion binderhub/static/js/src/form.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ export function getBuildFormValues() {
providerPrefix === "zenodo" ||
providerPrefix === "figshare" ||
providerPrefix === "dataverse" ||
providerPrefix === "hydroshare"
providerPrefix === "hydroshare" ||
providerPrefix === "ckan"
) {
ref = "";
}
Expand Down
29 changes: 29 additions & 0 deletions binderhub/tests/test_repoproviders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from tornado.ioloop import IOLoop

from binderhub.repoproviders import (
CKANProvider,
DataverseProvider,
FigshareProvider,
GistRepoProvider,
Expand Down Expand Up @@ -209,6 +210,34 @@ async def test_dataverse(
assert spec == resolved_spec


@pytest.mark.parametrize(
"spec,resolved_spec,resolved_ref,resolved_ref_url,build_slug",
[
[
"https://demo.ckan.org/dataset/sample-dataset-1",
"https://demo.ckan.org/dataset/sample-dataset-1",
"sample-dataset-1.v",
"https://demo.ckan.org/dataset/sample-dataset-1",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add some additional tests here for /history/ as well as ?activity_id= URLs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, added.

"ckan-sample-dataset-1",
],
],
)
async def test_ckan(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug):
provider = CKANProvider(spec=spec)

ref = await provider.get_resolved_ref()
assert resolved_ref in ref

slug = provider.get_build_slug()
assert slug == build_slug
repo_url = provider.get_repo_url()
assert repo_url == spec
ref_url = await provider.get_resolved_ref_url()
assert ref_url == resolved_ref_url
spec = await provider.get_resolved_spec()
assert spec == resolved_spec


@pytest.mark.github_api
@pytest.mark.parametrize(
"repo,unresolved_ref,resolved_ref",
Expand Down
2 changes: 2 additions & 0 deletions docs/source/developer/repoproviders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ Currently supported providers, their prefixes and specs are:
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| Dataverse | ``dataverse`` | ``<dataverse-DOI>`` | `Dataverse <https://dataverse.org/>`_ is open source research data repository software installed all over the world. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| CKAN | ``ckan`` | ``<url-escaped-url>/<dataset-id>`` | `CKAN <https://ckan.org/>`_ is an open source data management system. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| Git | ``git`` | ``<url-escaped-url>/<commit-sha>`` | A generic repository provider for URLs that point directly to a git repository. |
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+

Expand Down
5 changes: 5 additions & 0 deletions docs/source/reference/repoproviders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ Module: :mod:`binderhub.repoproviders`
.. autoconfigurable:: DataverseProvider
:members:

:class:`CKANProvider`
---------------------------

.. autoconfigurable:: CKANProvider
:members:

:class:`GitRepoProvider`
---------------------------
Expand Down
Loading