-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix outdated verification_mode
values
#5607
Changes from 6 commits
7b2d000
89721b2
b9cacdd
e45f3ea
11d5f49
4b00c9d
315a9bd
dc747fb
1b514bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -357,10 +357,11 @@ def __init__( | |
os.makedirs(self._cache_dir_root, exist_ok=True) | ||
lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock") | ||
with FileLock(lock_path): | ||
if os.path.exists(self._cache_dir): # check if data exist | ||
if len(os.listdir(self._cache_dir)) > 0: | ||
logger.info("Overwrite dataset info from restored data version.") | ||
self.info = DatasetInfo.from_directory(self._cache_dir) | ||
if os.path.exists(self._cache_dir) > 0: # check if data exist | ||
if len(os.listdir(self._cache_dir)): | ||
logger.info("Overwrite dataset info from restored data version if exists.") | ||
if os.path.exists(path_join(self._cache_dir, config.DATASET_INFO_FILENAME)): | ||
self.info = DatasetInfo.from_directory(self._cache_dir) | ||
else: # dir exists but no data, remove the empty dir as data aren't available anymore | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should remove the directory only if it's completely empty - but with your change it would remove it if the dataset info file is not there. This is in case some data fiels are still left somehow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks for noticing, fixed it, can I merge? |
||
logger.warning( | ||
f"Old caching folder {self._cache_dir} for dataset {self.name} exists but not data were found. Removing it. " | ||
|
@@ -374,7 +375,7 @@ def __init__( | |
# Set download manager | ||
self.dl_manager = None | ||
|
||
# Record infos even if verification_mode="none"; used by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json | ||
# Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value | ||
self._record_infos = False | ||
|
||
# Enable streaming (e.g. it patches "open" to work with remote files) | ||
|
@@ -718,10 +719,10 @@ def download_and_prepare( | |
``` | ||
""" | ||
if ignore_verifications != "deprecated": | ||
verification_mode = "none" if ignore_verifications else "full" | ||
verification_mode = VerificationMode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS | ||
warnings.warn( | ||
"'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.\n" | ||
f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", | ||
f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.", | ||
FutureWarning, | ||
) | ||
if use_auth_token != "deprecated": | ||
|
@@ -880,7 +881,8 @@ def incomplete_dir(dirname): | |
self.info.download_checksums = dl_manager.get_recorded_sizes_checksums() | ||
self.info.size_in_bytes = self.info.dataset_size + self.info.download_size | ||
# Save info | ||
self._save_info() | ||
if verification_mode is not VerificationMode.NO_CHECKS: | ||
self._save_info() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should always save the info. For example it stores the feature types and the arrow shards lengths. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hm true... load_dataset("ds_name") # error - failed verifications
# then
load_dataset("ds_name", verification_mode="no_checks") # successfully loaded
# then
load_dataset("ds_name") # successfully loaded despite it was an error before am I missing something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we can do anything tbh - e.g. reloading a cached dataset works even if you don't have the downloaded raw files anymore so we can't check for their checksums There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay reverted it. But I also changed incorrect values of |
||
|
||
# Download post processing resources | ||
self.download_post_processing_resources(dl_manager) | ||
|
@@ -1078,10 +1080,10 @@ def as_dataset( | |
``` | ||
""" | ||
if ignore_verifications != "deprecated": | ||
verification_mode = "none" if ignore_verifications else "full" | ||
verification_mode = verification_mode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS | ||
warnings.warn( | ||
"'ignore_verifications' was deprecated in favor of 'verification' in version 2.9.1 and will be removed in 3.0.0.\n" | ||
f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.", | ||
f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.", | ||
FutureWarning, | ||
) | ||
is_local = not is_remote_filesystem(self._fs) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.