Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix outdated verification_mode values #5607

Merged
22 changes: 12 additions & 10 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,10 +357,11 @@ def __init__(
os.makedirs(self._cache_dir_root, exist_ok=True)
lock_path = os.path.join(self._cache_dir_root, self._cache_dir.replace(os.sep, "_") + ".lock")
with FileLock(lock_path):
if os.path.exists(self._cache_dir): # check if data exist
if len(os.listdir(self._cache_dir)) > 0:
logger.info("Overwrite dataset info from restored data version.")
self.info = DatasetInfo.from_directory(self._cache_dir)
if os.path.exists(self._cache_dir) > 0: # check if data exist
if len(os.listdir(self._cache_dir)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if os.path.exists(self._cache_dir) > 0: # check if data exist
if len(os.listdir(self._cache_dir)):
if os.path.exists(self._cache_dir): # check if data exist
if len(os.listdir(self._cache_dir)) > 0:

logger.info("Overwrite dataset info from restored data version if exists.")
if os.path.exists(path_join(self._cache_dir, config.DATASET_INFO_FILENAME)):
self.info = DatasetInfo.from_directory(self._cache_dir)
else: # dir exists but no data, remove the empty dir as data aren't available anymore
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should remove the directory only if it's completely empty - but with your change it would remove it if the dataset info file is not there. This is in case some data files are still left somehow

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for noticing, fixed it, can I merge?

logger.warning(
f"Old caching folder {self._cache_dir} for dataset {self.name} exists but no data were found. Removing it. "
Expand All @@ -374,7 +375,7 @@ def __init__(
# Set download manager
self.dl_manager = None

# Record infos even if verification_mode="none"; used by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json
# Set to True by "datasets-cli test" to generate file checksums for (deprecated) dataset_infos.json independently of verification_mode value
self._record_infos = False

# Enable streaming (e.g. it patches "open" to work with remote files)
Expand Down Expand Up @@ -718,10 +719,10 @@ def download_and_prepare(
```
"""
if ignore_verifications != "deprecated":
verification_mode = "none" if ignore_verifications else "full"
verification_mode = VerificationMode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS
warnings.warn(
"'ignore_verifications' was deprecated in favor of 'verification_mode' in version 2.9.1 and will be removed in 3.0.0.\n"
f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.",
f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.",
FutureWarning,
)
if use_auth_token != "deprecated":
Expand Down Expand Up @@ -880,7 +881,8 @@ def incomplete_dir(dirname):
self.info.download_checksums = dl_manager.get_recorded_sizes_checksums()
self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
# Save info
self._save_info()
if verification_mode is not VerificationMode.NO_CHECKS:
self._save_info()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should always save the info. For example it stores the feature types and the arrow shards lengths.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm true...
what should we do then? I don't think it's fine that we get different results running the same command, like:

load_dataset("ds_name")  # error - failed verifications
# then
load_dataset("ds_name", verification_mode="no_checks")  # successfully loaded
# then
load_dataset("ds_name")  # successfully loaded despite it was an error before

am I missing something?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can do anything tbh - e.g. reloading a cached dataset works even if you don't have the downloaded raw files anymore so we can't check for their checksums

Copy link
Contributor Author

@polinaeterna polinaeterna Mar 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay reverted it.

But I also changed incorrect values of verification_mode in two places, from "none" -> "no_checks", "all" -> "all_checks" (they weren't updated in the previous verification PR #5303), do you agree with this change?


# Download post processing resources
self.download_post_processing_resources(dl_manager)
Expand Down Expand Up @@ -1078,10 +1080,10 @@ def as_dataset(
```
"""
if ignore_verifications != "deprecated":
verification_mode = "none" if ignore_verifications else "full"
verification_mode = VerificationMode.NO_CHECKS if ignore_verifications else VerificationMode.ALL_CHECKS
warnings.warn(
"'ignore_verifications' was deprecated in favor of 'verification' in version 2.9.1 and will be removed in 3.0.0.\n"
f"You can remove this warning by passing 'verification_mode={verification_mode}' instead.",
f"You can remove this warning by passing 'verification_mode={verification_mode.value}' instead.",
FutureWarning,
)
is_local = not is_remote_filesystem(self._fs)
Expand Down