Cleanup URL parsing mechanisms

yuvipanda · Jun 11, 2024 · 45fb7b9 · 45fb7b9
1 parent 4961341
commit 45fb7b9
Showing 1 changed file with 15 additions and 11 deletions.
diff --git a/repo2docker/contentproviders/ckan.py b/repo2docker/contentproviders/ckan.py
@@ -43,28 +43,32 @@ def detect(self, source, ref=None, extra_args=None):
         if not parsed_url.netloc:
             return None
 
-        url_parts_1 = parsed_url.path.split("/history/")
-        url_parts_2 = url_parts_1[0].split("/")
-        if url_parts_2[-2] == "dataset":
-            self.dataset_id = url_parts_2[-1]
-        else:
+        if "/dataset/" not in parsed_url.path:
+            # Not actually a dataset
             return None
 
-        api_url_path = "/api/3/action/"
+        # CKAN may be under a URL prefix, and we should accomodate that
+        url_prefix, dataset_url = parsed_url.path.split("/dataset/")
+
+        dataset_url_parts = dataset_url.split("/")
+        self.dataset_id = dataset_url_parts[0]
+
         api_url = parsed_url._replace(
-            path="/".join(url_parts_2[:-2]) + api_url_path, query=""
+            path=f"{url_prefix}/api/3/action/", query=""
         ).geturl()
 
         status_show_url = f"{api_url}status_show"
         resp = self.urlopen(status_show_url)
         if resp.status_code == 200:
 
-            # handle the activites
+            # Activity ID may be present either as a query parameter, activity_id
+            # or as part of the URL, under `/history/<activity-id>`. If `/history/`
+            # is present, that takes precedence over `activity_id`
             activity_id = None
-            if parse_qs(parsed_url.query).get("activity_id") is not None:
+            if "history" in dataset_url_parts:
+                activity_id = dataset_url_parts[dataset_url_parts.index("history") + 1]
+            elif parse_qs(parsed_url.query).get("activity_id") is not None:
                 activity_id = parse_qs(parsed_url.query).get("activity_id")[0]
-            if len(url_parts_1) == 2:
-                activity_id = url_parts_1[-1]
 
             self.version = self._fetch_version(api_url)
             return {