Skip to content

Commit

Permalink
canonicalize data dir in config ID hash
Browse files Browse the repository at this point in the history
This leaves the config ID hash unchanged when the data dir changes in
insubstantial ways, such as adding a trailing slash or reaching the same directory through a symlink.

fixes #5871
  • Loading branch information
kylrth committed May 25, 2023
1 parent 6801623 commit eac649f
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,16 @@ def create_config_id(
# it was previously ignored before the introduction of config id because we didn't want
# to change the config name. Now it's fine to take it into account for the config id.
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
if "data_dir" in config_kwargs_to_add_to_suffix and config_kwargs_to_add_to_suffix["data_dir"] is None:
config_kwargs_to_add_to_suffix.pop("data_dir", None)
if "data_dir" in config_kwargs_to_add_to_suffix:
if config_kwargs_to_add_to_suffix["data_dir"] is None:
config_kwargs_to_add_to_suffix.pop("data_dir", None)
else:
# canonicalize the data dir to avoid two paths to the same location having different
# hashes
data_dir = config_kwargs_to_add_to_suffix["data_dir"]
data_dir = os.path.realpath(data_dir)
data_dir = os.path.normpath(data_dir)
config_kwargs_to_add_to_suffix["data_dir"] = data_dir
if config_kwargs_to_add_to_suffix:
# we don't care about the order of the kwargs
config_kwargs_to_add_to_suffix = {
Expand Down

0 comments on commit eac649f

Please sign in to comment.