Skip to content

Commit

Permalink
canonicalize data dir in config ID hash (#5899)
Browse files Browse the repository at this point in the history
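This leaves the hash unchanged when the data dir changes in
insubstantial ways, like adding a trailing slash or redundant separators.
Note that the path is only normalized lexically (os.path.normpath), so a
symlink to the same directory is not resolved and still hashes differently.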

fixes #5871
  • Loading branch information
kylrth authored Jun 2, 2023
1 parent 7e52021 commit 02ee418
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,15 @@ def create_config_id(
# it was previously ignored before the introduction of config id because we didn't want
# to change the config name. Now it's fine to take it into account for the config id.
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
if "data_dir" in config_kwargs_to_add_to_suffix and config_kwargs_to_add_to_suffix["data_dir"] is None:
config_kwargs_to_add_to_suffix.pop("data_dir", None)
if "data_dir" in config_kwargs_to_add_to_suffix:
if config_kwargs_to_add_to_suffix["data_dir"] is None:
config_kwargs_to_add_to_suffix.pop("data_dir", None)
else:
# canonicalize the data dir so that equivalent spellings of the same path (trailing slash,
# redundant separators, "." or ".." segments) produce the same hash; symlinks are not resolved
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
data_dir = os.path.normpath(data_dir)
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
if config_kwargs_to_add_to_suffix:
# we don't care about the order of the kwargs
config_kwargs_to_add_to_suffix = {
Expand Down

0 comments on commit 02ee418

Please sign in to comment.