Merge pull request #82 from nasa/harmony-1714

Harmony 1714 - Disambiguate files when downloading files that have the same name from the same job
nasa · Mar 26, 2024 · 8d778c8 · 8d778c8
2 parents 5d35a0d + df60c5a
commit 8d778c8
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 2 deletions.
diff --git a/harmony/harmony.py b/harmony/harmony.py
@@ -26,6 +26,7 @@
 from tabnanny import check
 import time
 import platform
+from uuid import UUID
 from requests import Response
 from requests.exceptions import JSONDecodeError
 import requests.models
@@ -1078,13 +1079,57 @@ def result_urls(self,
                 if link['rel'] == 'data':
                     yield link['href']
 
+    def _is_staged_result(self, url: str) -> bool:
+        """Check if the URL indicates that the data is associated with actual
+        service outputs (as opposed to a download link for example).
+
+        Args:
+            url: The location (URL) of the file to be downloaded
+
+        Returns:
+            A boolean indicating whether the data is staged data.
+        """
+        url_parts = url.split('/')
+        possible_uuid = url_parts[-3]
+        possible_item_id = url_parts[-2]
+        try:
+            uuid_obj = UUID(possible_uuid, version=4)
+        except ValueError:
+            return False
+        if str(uuid_obj) != possible_uuid:
+            return False
+        if not possible_item_id.isnumeric():
+            return False
+        return True
+
+    def get_download_filename_from_url(self, url: str) -> str:
+        """For a given URL, returns the filename that will be used for download.
+        It will include a Harmony generated ID prefix if the data is staged.
+
+        Args:
+            url: The location (URL) of the file to be downloaded
+
+        Returns:
+            The filename that will be used to name the downloaded file.
+        """
+        url_parts = url.split('/')
+        original_filename = url_parts[-1]
+
+        is_staged_result = self._is_staged_result(url)
+        if not is_staged_result:
+            return original_filename
+        item_id = url_parts[-2]
+        return f'{item_id}_{original_filename}'
+
     def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
         """Downloads data, saves it to a file, and returns the filename.
 
         Performance should be close to native with an appropriate chunk size. This can be changed
         via environment variable DOWNLOAD_CHUNK_SIZE.
 
-        Filenames are automatically determined by using the latter portion of the provided URL.
+        Filenames are automatically determined by using the latter portion of the provided URL.
+        For staged results (service outputs), the filename is prefixed with the item id
+        generated by Harmony so that same-named files from one job do not collide.
 
         Args:
             url: The location (URL) of the file to be downloaded
@@ -1099,7 +1144,7 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
         """
         chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
         session = self._session()
-        filename = url.split('/')[-1]
+        filename = self.get_download_filename_from_url(url)
 
         if directory:
             filename = os.path.join(directory, filename)

diff --git a/tests/test_client.py b/tests/test_client.py
@@ -1162,6 +1162,18 @@ def side_effect_for_get_json(extra_links) -> List[str]:
 
     return [status_running1, status_running2, status_paused, status_resumed, status_successful, status_successful]
 
+def test_get_file_name_staged_link():
+    # For staged results, the filename should get prefixed with the work item id, to avoid collisions
+    client = Client(should_validate_auth=False)
+    actual_file_name = client.get_download_filename_from_url('https://harmony.earthdata.nasa.gov/service-results/staging-bucket/a7aee059-7531-4388-86e0-85af1de9c31a/1047412/C1254854453-LARC_CLOUD_merged.nc4')
+    assert actual_file_name == '1047412_C1254854453-LARC_CLOUD_merged.nc4'
+
+def test_get_file_name_non_staged_link():
+    # In this case, e.g. for a direct download data link, the filename should just be the last part of the URL path
+    client = Client(should_validate_auth=False)
+    actual_file_name = client.get_download_filename_from_url('https://harmony.earthdata.nasa.gov/service-results/test-data/C1261703151-EEDTEST/ATL08_20181014001049_02350102_006_02.h5')
+    assert actual_file_name == 'ATL08_20181014001049_02350102_006_02.h5'
+
 @pytest.mark.parametrize('link_type', [LinkType.http, LinkType.https, LinkType.s3])
 def test_iterator(link_type, mocker):
     extra_links = extra_links_for_iteration(link_type.value)