From 7b2d0003d850ca2dbca1e13b7232ab9bbfdb4b7a Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 3 Mar 2023 20:45:58 +0100 Subject: [PATCH 1/6] don't save dataset info to cache dir when verification mode is 'no_checks' --- src/datasets/builder.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 943a2f13db7..b1ca164ed00 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -357,10 +357,11 @@ def __init__( os.makedirs(self._cache_dir_root, exist_ok=True) lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock") with FileLock(lock_path): - if os.path.exists(self._cache_dir): # check if data exist - if len(os.listdir(self._cache_dir)) > 0: - logger.info("Overwrite dataset info from restored data version.") - self.info = DatasetInfo.from_directory(self._cache_dir) + if os.path.exists(self._cache_dir) > 0: # check if data exist + if len(os.listdir(self._cache_dir)): + logger.info("Overwrite dataset info from restored data version if exists.") + if os.path.exists(path_join(self._cache_dir, config.DATASET_INFO_FILENAME)): + self.info = DatasetInfo.from_directory(self._cache_dir) else: # dir exists but no data, remove the empty dir as data aren't available anymore logger.warning( f"Old caching folder {self._cache_dir} for dataset {self.name} exists but not data were found. Removing it. " @@ -880,7 +881,8 @@ def incomplete_dir(dirname): self.info.download_checksums = dl_manager.get_recorded_sizes_checksums() self.info.size_in_bytes = self.info.dataset_size + self.info.download_size # Save info - self._save_info() + if verification_mode is not VerificationMode.NO_CHECKS: + self._save_info() # Download post processing resources self.download_post_processing_resources(dl_manager) @@ -1705,7 +1707,7 @@ def _prepare_split( is_local = not is_remote_filesystem(self._fs) path_join = os.path.join if is_local else posixpath.join - if self.info.splits is not None: + if self.info.splits is not None: # and verification_mode == " .. " split_info = self.info.splits[split_generator.name] else: split_info = split_generator.split_info From 89721b28d3b6654197e771897f5e65f85e02cc0e Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Fri, 3 Mar 2023 20:51:50 +0100 Subject: [PATCH 2/6] remove comment --- src/datasets/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index b1ca164ed00..46fbe50d59c 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -1707,7 +1707,7 @@ def _prepare_split( is_local = not is_remote_filesystem(self._fs) path_join = os.path.join if is_local else posixpath.join - if self.info.splits is not None: # and verification_mode == " .. " + if self.info.splits is not None: split_info = self.info.splits[split_generator.name] else: split_info = split_generator.split_info From e45f3ea6467e1f1c8db84f3239520de36d8948ca Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Mar 2023 11:24:17 +0100 Subject: [PATCH 3/6] fix outdated verification mode values: 'none' -> 'no_checks', 'all' -> 'all_checks' --- src/datasets/builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 46fbe50d59c..88a89530f0f 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -719,7 +719,7 @@ def download_and_prepare( ``` """ if ignore_verifications != "deprecated": - verification_mode = "none" if ignore_verifications else "full" + verification_mode = "no_checks" if ignore_verifications else "all_checks" warnings.warn( "'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.\n" f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", @@ -1080,7 +1080,7 @@ def as_dataset( ``` """ if ignore_verifications != "deprecated": - verification_mode = "none" if ignore_verifications else "full" + verification_mode = "no_checks" if ignore_verifications else "all_checks" warnings.warn( "'ignore_verifications' was deprecated in favor of 'verification' in version 2.9.1 and will be removed in 3.0.0.\n" f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", From 4b00c9d377fe80aafc92a0b95665b0c5333d7a83 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Mar 2023 12:34:48 +0100 Subject: [PATCH 4/6] change string repr of verification_mode to enum to avoid bugs if we change values again --- src/datasets/builder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 88a89530f0f..f24df9e955d 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -375,7 +375,7 @@ def __init__( # Set download manager self.dl_manager = None - # Record infos even if verification_mode="none"; used by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json + # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value self._record_infos = False # Enable streaming (e.g. it patches "open" to work with remote files) @@ -719,10 +719,10 @@ def download_and_prepare( ``` """ if ignore_verifications != "deprecated": - verification_mode = "no_checks" if ignore_verifications else "all_checks" + verification_mode = VerificationMode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS warnings.warn( "'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.\n" - f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", + f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.", FutureWarning, ) if use_auth_token != "deprecated": @@ -1080,10 +1080,10 @@ def as_dataset( ``` """ if ignore_verifications != "deprecated": - verification_mode = "no_checks" if ignore_verifications else "all_checks" + verification_mode = verification_mode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS warnings.warn( "'ignore_verifications' was deprecated in favor of 'verification' in version 2.9.1 and will be removed in 3.0.0.\n" - f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", + f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.", FutureWarning, ) is_local = not is_remote_filesystem(self._fs) From 315a9bd11b0c843eb997576828cc20129df6b938 Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Mar 2023 13:18:49 +0100 Subject: [PATCH 5/6] revert all changes except for fixing verification_mode values --- src/datasets/builder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index f24df9e955d..d6a0ef9f89d 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -357,11 +357,12 @@ def __init__( os.makedirs(self._cache_dir_root, exist_ok=True) lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock") with FileLock(lock_path): - if os.path.exists(self._cache_dir) > 0: # check if data exist - if len(os.listdir(self._cache_dir)): + if os.path.exists(self._cache_dir): # check if data exist + if len(os.listdir(self._cache_dir)) > 0 and os.path.exists( + path_join(self._cache_dir, config.DATASET_INFO_FILENAME) + ): logger.info("Overwrite dataset info from restored data version if exists.") - if os.path.exists(path_join(self._cache_dir, config.DATASET_INFO_FILENAME)): - self.info = DatasetInfo.from_directory(self._cache_dir) + self.info = DatasetInfo.from_directory(self._cache_dir) else: # dir exists but no data, remove the empty dir as data aren't available anymore logger.warning( f"Old caching folder {self._cache_dir} for dataset {self.name} exists but not data were found. Removing it. " @@ -375,7 +376,7 @@ def __init__( # Set download manager self.dl_manager = None - # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value + # Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value. self._record_infos = False # Enable streaming (e.g. it patches "open" to work with remote files) @@ -881,8 +882,7 @@ def incomplete_dir(dirname): self.info.download_checksums = dl_manager.get_recorded_sizes_checksums() self.info.size_in_bytes = self.info.dataset_size + self.info.download_size # Save info - if verification_mode is not VerificationMode.NO_CHECKS: - self._save_info() + self._save_info() # Download post processing resources self.download_post_processing_resources(dl_manager) From dc747fbd8aa7c4a83168696b3ce7716d69e74b0f Mon Sep 17 00:00:00 2001 From: polinaeterna Date: Wed, 8 Mar 2023 21:21:01 +0100 Subject: [PATCH 6/6] fix: delete dir when there are no files --- src/datasets/builder.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index d6a0ef9f89d..103f2dbd839 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -358,14 +358,13 @@ def __init__( lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock") with FileLock(lock_path): if os.path.exists(self._cache_dir): # check if data exist - if len(os.listdir(self._cache_dir)) > 0 and os.path.exists( - path_join(self._cache_dir, config.DATASET_INFO_FILENAME) - ): - logger.info("Overwrite dataset info from restored data version if exists.") - self.info = DatasetInfo.from_directory(self._cache_dir) + if len(os.listdir(self._cache_dir)) > 0: + if os.path.exists(path_join(self._cache_dir, config.DATASET_INFO_FILENAME)): + logger.info("Overwrite dataset info from restored data version if exists.") + self.info = DatasetInfo.from_directory(self._cache_dir) else: # dir exists but no data, remove the empty dir as data aren't available anymore logger.warning( - f"Old caching folder {self._cache_dir} for dataset {self.name} exists but not data were found. Removing it. " + f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it. " ) os.rmdir(self._cache_dir)