From c78092f9e9b991446f3df56b292cf04978911df0 Mon Sep 17 00:00:00 2001 From: Piotr Date: Thu, 17 Oct 2024 14:24:46 +0200 Subject: [PATCH] Do not re-download attachment if exit on disk (#15) --- .../salesforce/download.py | 89 +++++++------------ test/salesforce/test_download.py | 61 +++++++------ 2 files changed, 67 insertions(+), 83 deletions(-) diff --git a/src/salesforce_archivist/salesforce/download.py b/src/salesforce_archivist/salesforce/download.py index ea2cb38..eb7d1d9 100644 --- a/src/salesforce_archivist/salesforce/download.py +++ b/src/salesforce_archivist/salesforce/download.py @@ -206,78 +206,55 @@ def __init__( self._stop_event = threading.Event() self._max_workers = max_workers - def _download_content_version_from_sf( - self, downloaded_versions_list: DownloadedList, version: ContentVersion, download_path: str + def _download_file_from_sf_api( + self, downloaded_list: DownloadedList, download_obj: Union[ContentVersion, Attachment], download_path: str ) -> None: - downloaded_version = downloaded_versions_list.get(version) + pass + + def download_file_from_sf( + self, + downloaded_list: DownloadedList, + download_obj: Union[ContentVersion, Attachment], + download_path: str, + ) -> None: + downloaded_file = downloaded_list.get(download_obj) # file exist under the path that we want to download into if os.path.exists(download_path): - # if no version exist in downloaded list add a new downloaded version with this path - if downloaded_version is None: - downloaded_version = DownloadedSalesforceObject( - obj_id=version.id, + # if no file exist in downloaded list add a new downloaded version with this path + if downloaded_file is None: + downloaded_file = DownloadedSalesforceObject( + obj_id=download_obj.id, path=download_path, ) - downloaded_versions_list.add(downloaded_version) + downloaded_list.add(downloaded_file) - # version is on downloaded list and version points to existing file on disk - elif downloaded_version is not None and os.path.exists(downloaded_version.path): - # copy existing file if download path is different from already downloaded version path in the list - if downloaded_version.path != download_path: + # file is on downloaded list and it points to existing file on disk + elif downloaded_file is not None and os.path.exists(downloaded_file.path): + # copy existing file if download path is different from already downloaded path in the list + if downloaded_file.path != download_path: os.makedirs(os.path.dirname(download_path), exist_ok=True) - shutil.copy(downloaded_version.path, download_path) + shutil.copy(downloaded_file.path, download_path) - # download version using SF API and add to the list + # download file using SF API and add to the list else: - result = self._client.download_content_version(version) + if isinstance(download_obj, ContentVersion): + result = self._client.download_content_version(download_obj) + elif isinstance(download_obj, Attachment): + result = self._client.download_attachment(download_obj) + else: + raise ValueError("Unknown object type provided {type}".format(type=type(download_obj))) + os.makedirs(os.path.dirname(download_path), exist_ok=True) with open(download_path, "wb") as file: for chunk in result.iter_content(chunk_size=1024): if chunk: file.write(chunk) - downloaded_version = DownloadedSalesforceObject( - obj_id=version.id, + downloaded_file = DownloadedSalesforceObject( + obj_id=download_obj.id, path=download_path, ) - downloaded_versions_list.add(downloaded_version) - - def _download_attachment_from_sf( - self, downloaded_attachment_list: DownloadedList, attachment: Attachment, download_path: str - ) -> None: - result = self._client.download_attachment(attachment) - os.makedirs(os.path.dirname(download_path), exist_ok=True) - with open(download_path, "wb") as file: - for chunk in result.iter_content(chunk_size=1024): - if chunk: - file.write(chunk) - - downloaded_attachment = DownloadedSalesforceObject( - obj_id=attachment.id, - path=download_path, - ) - downloaded_attachment_list.add(downloaded_attachment) - - def download_from_sf( - self, - downloaded_list: DownloadedList, - download_obj: Union[ContentVersion, Attachment], - download_path: str, - ) -> None: - if isinstance(download_obj, ContentVersion): - self._download_content_version_from_sf( - downloaded_versions_list=downloaded_list, - version=download_obj, - download_path=download_path, - ) - elif isinstance(download_obj, Attachment): - self._download_attachment_from_sf( - downloaded_attachment_list=downloaded_list, - attachment=download_obj, - download_path=download_path, - ) - else: - raise ValueError("Unknown object type provided {type}".format(type=type(download_obj))) + downloaded_list.add(downloaded_file) def _print_download_msg(self, msg: str, error: bool = False) -> None: try: @@ -313,7 +290,7 @@ def download_or_wait( error = False try: self._wait_if_api_usage_limit() - self.download_from_sf( + self.download_file_from_sf( downloaded_list=downloaded_list, download_obj=download_obj, download_path=download_path ) except StopDownloadException: diff --git a/test/salesforce/test_download.py b/test/salesforce/test_download.py index c38bf17..30a85c2 100644 --- a/test/salesforce/test_download.py +++ b/test/salesforce/test_download.py @@ -288,7 +288,7 @@ def test_downloader_download_will_gracefully_shutdown(shutdown_mock, submit_mock shutdown_mock.assert_has_calls([call(wait=True), call(wait=True, cancel_futures=True)]) -@patch.object(Downloader, "download_from_sf", side_effect=RuntimeError) +@patch.object(Downloader, "download_file_from_sf", side_effect=RuntimeError) def test_downloader_download_will_return_download_stats(download_mock): archivist_obj = ArchivistObject(data_dir="/fake/dir", obj_type="User") link_list = ContentDocumentLinkList(data_dir=archivist_obj.obj_dir) @@ -323,7 +323,7 @@ def test_downloader_download_will_return_download_stats(download_mock): @patch("os.path.exists") -def test_downloader_download_from_sf_will_add_already_downloaded_object_to_list( +def test_downloader_download_file_from_sf_will_add_already_downloaded_object_to_list( exist_mock, ): exist_mock.return_value = True @@ -337,25 +337,32 @@ def test_downloader_download_from_sf_will_add_already_downloaded_object_to_list( version_number=1, content_size=10, ) + attachment = Attachment(attachment_id="ID", parent_id="PID", content_size=10, name="Name") downloaded_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv") sf_client = Mock() downloader = Downloader( sf_client=sf_client, ) - downloader.download_from_sf(downloaded_list=downloaded_list, download_obj=version, download_path="/fake/path") - exist_mock.assert_called_once() - assert len(downloaded_list) == 1 + downloader.download_file_from_sf(downloaded_list=downloaded_list, download_obj=version, download_path="/fake/path") + downloader.download_file_from_sf( + downloaded_list=downloaded_list, download_obj=attachment, download_path="/fake/path" + ) + assert exist_mock.call_count == 2 + assert len(downloaded_list) == 2 assert downloaded_list.get(obj=version).id == version.id + assert downloaded_list.get(obj=attachment).id == attachment.id + assert sf_client.download_attachment.call_count == 0 + assert sf_client.download_content_version.call_count == 0 -def test_downloader_download_from_sf_will_copy_existing_file_to_new_path(): +def test_downloader_download_file_from_sf_will_copy_existing_file_to_new_path(): with tempfile.TemporaryDirectory() as tmp_dir: archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User") already_downloaded_path = os.path.join(archivist_obj.obj_dir, "files", "file1.txt") to_download_path = os.path.join(archivist_obj.obj_dir, "files", "file2.txt") download_list_mock = MagicMock() - version1 = ContentVersion( + obj1 = ContentVersion( version_id="CID", document_id="DOC1", checksum="c", @@ -364,7 +371,7 @@ def test_downloader_download_from_sf_will_copy_existing_file_to_new_path(): version_number=1, content_size=10, ) - version2 = ContentVersion( + obj2 = ContentVersion( version_id="CID", document_id="DOC2", checksum="c", @@ -374,38 +381,38 @@ def test_downloader_download_from_sf_will_copy_existing_file_to_new_path(): content_size=10, ) download_list_mock.__iter__.return_value = [ - (version1, already_downloaded_path), - (version2, to_download_path), + (obj1, already_downloaded_path), + (obj2, to_download_path), ] os.makedirs(os.path.dirname(already_downloaded_path), exist_ok=True) file_contents = b"test" with open(already_downloaded_path, "wb") as downloaded_file: downloaded_file.write(file_contents) - downloaded_version_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv") - downloaded_version = DownloadedSalesforceObject( - obj_id=version1.id, + downloaded_list = DownloadedList(data_dir=archivist_obj.obj_dir, file_name="downloaded_versions.csv") + downloaded_obj = DownloadedSalesforceObject( + obj_id=obj1.id, path=already_downloaded_path, ) - downloaded_version_list.add(downloaded_version) + downloaded_list.add(downloaded_obj) sf_client = Mock() downloader = Downloader( sf_client=sf_client, ) - downloader.download_from_sf( - download_obj=version2, + downloader.download_file_from_sf( + download_obj=obj2, download_path=to_download_path, - downloaded_list=downloaded_version_list, + downloaded_list=downloaded_list, ) assert os.path.exists(to_download_path) with open(to_download_path, "rb") as new_file: assert new_file.read() == file_contents -def test_downloader_download_from_sf_will_download_version_from_salesforce(): +def test_downloader_download_file_from_sf_will_download_version_from_salesforce(): with tempfile.TemporaryDirectory() as tmp_dir: archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User") - version = ContentVersion( + obj = ContentVersion( version_id="VID1", document_id="DOC1", checksum="c1", @@ -424,8 +431,8 @@ def test_downloader_download_from_sf_will_download_version_from_salesforce(): sf_client=sf_client, ) path = os.path.join(tmp_dir, "test.txt") - downloader.download_from_sf( - download_obj=version, + downloader.download_file_from_sf( + download_obj=obj, download_path=path, downloaded_list=downloaded_list, ) @@ -434,10 +441,10 @@ def test_downloader_download_from_sf_will_download_version_from_salesforce(): assert file.read() == b"test" -def test_downloader_download_from_sf_will_download_attachment_from_salesforce(): +def test_downloader_download_file_from_sf_will_download_attachment_from_salesforce(): with tempfile.TemporaryDirectory() as tmp_dir: - archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="User") - attachment = Attachment( + archivist_obj = ArchivistObject(data_dir=tmp_dir, obj_type="Attachment") + obj = Attachment( attachment_id="ID", parent_id="PID", content_size=10, @@ -453,8 +460,8 @@ def test_downloader_download_from_sf_will_download_attachment_from_salesforce(): sf_client=sf_client, ) path = os.path.join(tmp_dir, "test.txt") - downloader.download_from_sf( - download_obj=attachment, + downloader.download_file_from_sf( + download_obj=obj, download_path=path, downloaded_list=downloaded_list, ) @@ -477,7 +484,7 @@ def usage_side_effect(refresh: bool) -> ApiUsage: sf_client.get_api_usage.side_effect = lambda refresh=False: usage_side_effect(refresh=refresh) download_list_mock = MagicMock() download_list_mock.__iter__.return_value = [] - with patch.object(Downloader, "download_from_sf"): + with patch.object(Downloader, "download_file_from_sf"): wait = 7 downloader = Downloader( sf_client=sf_client,