diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 13190244ed1..7815358210f 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -170,8 +170,15 @@ def create_config_id( # it was previously ignored before the introduction of config id because we didn't want # to change the config name. Now it's fine to take it into account for the config id. # config_kwargs_to_add_to_suffix.pop("data_dir", None) - if "data_dir" in config_kwargs_to_add_to_suffix and config_kwargs_to_add_to_suffix["data_dir"] is None: - config_kwargs_to_add_to_suffix.pop("data_dir", None) + if "data_dir" in config_kwargs_to_add_to_suffix: + if config_kwargs_to_add_to_suffix["data_dir"] is None: + config_kwargs_to_add_to_suffix.pop("data_dir", None) + else: + # canonicalize the data dir to avoid two paths to the same location having different + # hashes + data_dir = config_kwargs_to_add_to_suffix["data_dir"] + data_dir = os.path.normpath(data_dir) + config_kwargs_to_add_to_suffix["data_dir"] = data_dir if config_kwargs_to_add_to_suffix: # we don't care about the order of the kwargs config_kwargs_to_add_to_suffix = {