Skip to content

Commit

Permalink
Merge pull request #82 from nasa/harmony-1714
Browse files Browse the repository at this point in the history
Harmony 1714 - Disambiguate files when downloading files that have the same name from the same job
  • Loading branch information
vinnyinverso authored Mar 26, 2024
2 parents 5d35a0d + df60c5a commit 8d778c8
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 2 deletions.
49 changes: 47 additions & 2 deletions harmony/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from tabnanny import check
import time
import platform
from uuid import UUID
from requests import Response
from requests.exceptions import JSONDecodeError
import requests.models
Expand Down Expand Up @@ -1078,13 +1079,57 @@ def result_urls(self,
if link['rel'] == 'data':
yield link['href']

def _is_staged_result(self, url: str) -> bool:
    """Check if the URL indicates that the data is associated with actual
    service outputs (as opposed to a download link for example).

    A URL is treated as staged when its last three path segments look like
    ``<uuid4>/<numeric item id>/<filename>``.

    Args:
        url: The location (URL) of the file to be downloaded

    Returns:
        A boolean indicating whether the data is staged data. URLs with
        fewer than three '/'-separated segments are never staged.
    """
    url_parts = url.split('/')
    # A staged URL needs at least <uuid>/<item id>/<filename>; without this
    # guard a short URL (e.g. a bare filename) would raise IndexError below.
    if len(url_parts) < 3:
        return False

    possible_uuid = url_parts[-3]
    possible_item_id = url_parts[-2]

    try:
        uuid_obj = UUID(possible_uuid, version=4)
    except ValueError:
        return False

    # UUID() accepts several spellings (braces, missing hyphens, upper case)
    # and, given version=4, rewrites the version/variant bits. Comparing the
    # canonical string form back against the input therefore rejects any
    # segment that is not already a canonical version-4 UUID.
    if str(uuid_obj) != possible_uuid:
        return False

    return possible_item_id.isnumeric()

def get_download_filename_from_url(self, url: str) -> str:
    """Return the local filename that a download of ``url`` will use.

    The name is the final '/'-separated segment of the URL. When the URL is
    recognised as staged data, the segment immediately before it (the
    Harmony generated item id) is prepended with an underscore separator.

    Args:
        url: The location (URL) of the file to be downloaded

    Returns:
        The filename that will be used to name the downloaded file.
    """
    segments = url.split('/')
    basename = segments[-1]

    if self._is_staged_result(url):
        # Prefix with the item id so that identically named outputs from
        # the same job do not collide on disk.
        return f'{segments[-2]}_{basename}'

    return basename

def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
"""Downloads data, saves it to a file, and returns the filename.
Performance should be close to native with an appropriate chunk size. This can be changed
via environment variable DOWNLOAD_CHUNK_SIZE.
Filenames are automatically determined by using the latter portion of the provided URL.
Filenames are automatically determined by using the latter portion of the provided URL
and will be prefixed by the item id generated by Harmony when data was transformed
from the original.
Args:
url: The location (URL) of the file to be downloaded
Expand All @@ -1099,7 +1144,7 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
"""
chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
session = self._session()
filename = url.split('/')[-1]
filename = self.get_download_filename_from_url(url)

if directory:
filename = os.path.join(directory, filename)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -1162,6 +1162,18 @@ def side_effect_for_get_json(extra_links) -> List[str]:

return [status_running1, status_running2, status_paused, status_resumed, status_successful, status_successful]

def test_get_file_name_staged_link():
    """Staged result URLs get the work item id prefixed to the filename, to avoid collisions."""
    harmony_client = Client(should_validate_auth=False)
    staged_url = 'https://harmony.earthdata.nasa.gov/service-results/staging-bucket/a7aee059-7531-4388-86e0-85af1de9c31a/1047412/C1254854453-LARC_CLOUD_merged.nc4'
    expected_name = '1047412_C1254854453-LARC_CLOUD_merged.nc4'

    assert harmony_client.get_download_filename_from_url(staged_url) == expected_name

def test_get_file_name_non_staged_link():
    """Non-staged URLs (e.g. a direct download data link) keep only the last part of the URL path."""
    harmony_client = Client(should_validate_auth=False)
    direct_url = 'https://harmony.earthdata.nasa.gov/service-results/test-data/C1261703151-EEDTEST/ATL08_20181014001049_02350102_006_02.h5'
    expected_name = 'ATL08_20181014001049_02350102_006_02.h5'

    assert harmony_client.get_download_filename_from_url(direct_url) == expected_name

@pytest.mark.parametrize('link_type', [LinkType.http, LinkType.https, LinkType.s3])
def test_iterator(link_type, mocker):
extra_links = extra_links_for_iteration(link_type.value)
Expand Down

0 comments on commit 8d778c8

Please sign in to comment.