From f5136ea69dc321d27495e14496b98964a7da3b1d Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 19 Mar 2024 11:48:17 +0100
Subject: [PATCH 1/4] fix offline mode with single config

---
 src/datasets/packaged_modules/cache/cache.py | 27 ++++++++++++--------
 tests/packaged_modules/test_cache.py         | 20 +++++++++++++++
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py
index 54c4737be11..b8d99d10284 100644
--- a/src/datasets/packaged_modules/cache/cache.py
+++ b/src/datasets/packaged_modules/cache/cache.py
@@ -114,16 +114,23 @@ def __init__(
         if data_dir is not None:
             config_kwargs["data_dir"] = data_dir
         if hash == "auto" and version == "auto":
-            # First we try to find a folder that takes the config_kwargs into account
-            # e.g. with "default-data_dir=data%2Ffortran" as config_id
-            config_id = self.BUILDER_CONFIG_CLASS(config_name or "default").create_config_id(
-                config_kwargs=config_kwargs, custom_features=features
-            )
-            config_name, version, hash = _find_hash_in_cache(
-                dataset_name=repo_id or dataset_name,
-                config_name=config_id,
-                cache_dir=cache_dir,
-            )
+            if config_name or config_kwargs:
+                # First we try to find a folder that takes the config_kwargs into account
+                # e.g. with "default-data_dir=data%2Ffortran" as config_id
+                config_id = self.BUILDER_CONFIG_CLASS(config_name or "default").create_config_id(
+                    config_kwargs=config_kwargs, custom_features=features
+                )
+                config_name, version, hash = _find_hash_in_cache(
+                    dataset_name=repo_id or dataset_name,
+                    config_name=config_id,
+                    cache_dir=cache_dir,
+                )
+            else:
+                config_name, version, hash = _find_hash_in_cache(
+                    dataset_name=repo_id or dataset_name,
+                    config_name=None,
+                    cache_dir=cache_dir,
+                )
         elif hash == "auto" or version == "auto":
             raise NotImplementedError("Pass both hash='auto' and version='auto' instead")
         super().__init__(
diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py
index aed1496cee8..aa65e697af1 100644
--- a/tests/packaged_modules/test_cache.py
+++ b/tests/packaged_modules/test_cache.py
@@ -6,6 +6,7 @@
 from datasets.packaged_modules.cache.cache import Cache
 
 
+SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_single_config_in_metadata"
 SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
 
 
@@ -74,3 +75,22 @@ def test_cache_multi_configs():
     with pytest.raises(ValueError) as excinfo:
         Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
     assert config_name in str(excinfo.value)
+
+
+@pytest.mark.integration
+def test_cache_single_config():
+    repo_id = SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA
+    dataset_name = repo_id.split("/")[-1]
+    config_name = "custom"
+    ds = load_dataset(repo_id)
+    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, version="auto", hash="auto")
+    reloaded = cache.as_dataset()
+    assert list(ds) == list(reloaded)
+    assert len(ds["train"]) == len(reloaded["train"])
+    cache = Cache(dataset_name=dataset_name, config_name=config_name, repo_id=repo_id, version="auto", hash="auto")
+    reloaded = cache.as_dataset()
+    assert list(ds) == list(reloaded)
+    assert len(ds["train"]) == len(reloaded["train"])
+    with pytest.raises(ValueError) as excinfo:
+        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
+    assert config_name in str(excinfo.value)

From f76131258436399995fd6b0c2e8c019268adba81 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Fri, 22 Mar 2024 17:59:31 +0100
Subject: [PATCH 2/4] fix tests

---
 src/datasets/packaged_modules/cache/cache.py | 76 ++++++++-------
 tests/packaged_modules/test_cache.py         | 97 ++++++++++++++------
 2 files changed, 112 insertions(+), 61 deletions(-)

diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py
index b8d99d10284..4eb7855bd48 100644
--- a/src/datasets/packaged_modules/cache/cache.py
+++ b/src/datasets/packaged_modules/cache/cache.py
@@ -1,4 +1,5 @@
 import glob
+import json
 import os
 import shutil
 import time
@@ -22,43 +23,64 @@ def _get_modification_time(cached_directory_path):
 
 
 def _find_hash_in_cache(
-    dataset_name: str, config_name: Optional[str], cache_dir: Optional[str]
+    dataset_name: str,
+    config_name: Optional[str],
+    cache_dir: Optional[str],
+    config_kwargs: dict,
+    custom_features: Optional[datasets.Features],
 ) -> Tuple[str, str, str]:
+    if config_name or config_kwargs or custom_features:
+        config_id = datasets.BuilderConfig(config_name or "default").create_config_id(
+            config_kwargs=config_kwargs, custom_features=custom_features
+        )
+    else:
+        config_id = None
     cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE))
     cached_datasets_directory_path_root = os.path.join(cache_dir, dataset_name.replace("/", "___"))
     cached_directory_paths = [
         cached_directory_path
         for cached_directory_path in glob.glob(
-            os.path.join(cached_datasets_directory_path_root, config_name or "*", "*", "*")
+            os.path.join(cached_datasets_directory_path_root, config_id or "*", "*", "*")
         )
         if os.path.isdir(cached_directory_path)
+        and (
+            config_kwargs
+            or custom_features
+            or json.load(open(os.path.join(cached_directory_path, "dataset_info.json"), "r"))["config_name"]
+            == Path(cached_directory_path).parts[-3]  # no extra params => config_id == config_name
+        )
     ]
     if not cached_directory_paths:
-        if config_name is not None:
-            cached_directory_paths = [
-                cached_directory_path
-                for cached_directory_path in glob.glob(
-                    os.path.join(cached_datasets_directory_path_root, "*", "*", "*")
-                )
-                if os.path.isdir(cached_directory_path)
-            ]
+        cached_directory_paths = [
+            cached_directory_path
+            for cached_directory_path in glob.glob(
+                os.path.join(cached_datasets_directory_path_root, "*", "*", "*")
+            )
+            if os.path.isdir(cached_directory_path)
+        ]
         available_configs = sorted(
             {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths}
         )
         raise ValueError(
             f"Couldn't find cache for {dataset_name}"
-            + (f" for config '{config_name}'" if config_name else "")
+            + (f" for config '{config_id}'" if config_id else "")
             + (f"\nAvailable configs in the cache: {available_configs}" if available_configs else "")
         )
     # get most recent
     cached_directory_path = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1])
     version, hash = cached_directory_path.parts[-2:]
     other_configs = [
-        Path(cached_directory_path).parts[-3]
-        for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", version, hash))
-        if os.path.isdir(cached_directory_path)
+        Path(_cached_directory_path).parts[-3]
+        for _cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", version, hash))
+        if os.path.isdir(_cached_directory_path)
+        and (
+            config_kwargs
+            or custom_features
+            or json.load(open(os.path.join(_cached_directory_path, "dataset_info.json"), "r"))["config_name"]
+            == Path(_cached_directory_path).parts[-3]  # no extra params => config_id == config_name
+        )
     ]
-    if not config_name and len(other_configs) > 1:
+    if not config_id and len(other_configs) > 1:
         raise ValueError(
             f"There are multiple '{dataset_name}' configurations in the cache: {', '.join(other_configs)}"
             f"\nPlease specify which configuration to reload from the cache, e.g."
@@ -114,23 +136,13 @@ def __init__(
         if data_dir is not None:
             config_kwargs["data_dir"] = data_dir
         if hash == "auto" and version == "auto":
-            if config_name or config_kwargs:
-                # First we try to find a folder that takes the config_kwargs into account
-                # e.g. with "default-data_dir=data%2Ffortran" as config_id
-                config_id = self.BUILDER_CONFIG_CLASS(config_name or "default").create_config_id(
-                    config_kwargs=config_kwargs, custom_features=features
-                )
-                config_name, version, hash = _find_hash_in_cache(
-                    dataset_name=repo_id or dataset_name,
-                    config_name=config_id,
-                    cache_dir=cache_dir,
-                )
-            else:
-                config_name, version, hash = _find_hash_in_cache(
-                    dataset_name=repo_id or dataset_name,
-                    config_name=None,
-                    cache_dir=cache_dir,
-                )
+            config_name, version, hash = _find_hash_in_cache(
+                dataset_name=repo_id or dataset_name,
+                config_name=config_name,
+                cache_dir=cache_dir,
+                config_kwargs=config_kwargs,
+                custom_features=features,
+            )
         elif hash == "auto" or version == "auto":
             raise NotImplementedError("Pass both hash='auto' and version='auto' instead")
         super().__init__(
diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py
index aa65e697af1..fdde27dbc1c 100644
--- a/tests/packaged_modules/test_cache.py
+++ b/tests/packaged_modules/test_cache.py
@@ -10,37 +10,43 @@
 SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata"
 
 
-def test_cache(text_dir: Path):
-    ds = load_dataset(str(text_dir))
+def test_cache(text_dir: Path, tmp_path: Path):
+    cache_dir = tmp_path / "test_cache"
+    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))
     hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2]
-    cache = Cache(dataset_name=text_dir.name, hash=hash)
+    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash=hash)
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert list(ds["train"]) == list(reloaded["train"])
 
 
-def test_cache_streaming(text_dir: Path):
-    ds = load_dataset(str(text_dir))
+def test_cache_streaming(text_dir: Path, tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_streaming"
+    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))
     hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2]
-    cache = Cache(dataset_name=text_dir.name, hash=hash)
+    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash=hash)
     reloaded = cache.as_streaming_dataset()
     assert list(ds) == list(reloaded)
     assert list(ds["train"]) == list(reloaded["train"])
 
 
-def test_cache_auto_hash(text_dir: Path):
-    ds = load_dataset(str(text_dir))
-    cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
+def test_cache_auto_hash(text_dir: Path, tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_auto_hash"
+    ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))
+    cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version="auto", hash="auto")
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert list(ds["train"]) == list(reloaded["train"])
 
 
-def test_cache_auto_hash_with_custom_config(text_dir: Path):
-    ds = load_dataset(str(text_dir), sample_by="paragraph")
-    another_ds = load_dataset(str(text_dir))
-    cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto", sample_by="paragraph")
-    another_cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
+def test_cache_auto_hash_with_custom_config(text_dir: Path, tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_auto_hash_with_custom_config"
+    ds = load_dataset(str(text_dir), sample_by="paragraph", cache_dir=str(cache_dir))
+    another_ds = load_dataset(str(text_dir), cache_dir=str(cache_dir))
+    cache = Cache(
+        cache_dir=str(cache_dir), dataset_name=text_dir.name, version="auto", hash="auto", sample_by="paragraph"
+    )
+    another_cache = Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version="auto", hash="auto")
     assert cache.config_id.endswith("paragraph")
     assert not another_cache.config_id.endswith("paragraph")
     reloaded = cache.as_dataset()
@@ -51,46 +57,79 @@ def test_cache_auto_hash_with_custom_config(text_dir: Path):
     assert list(another_ds["train"]) == list(another_reloaded["train"])
 
 
-def test_cache_missing(text_dir: Path):
-    load_dataset(str(text_dir))
-    Cache(dataset_name=text_dir.name, version="auto", hash="auto").download_and_prepare()
+def test_cache_missing(text_dir: Path, tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_missing"
+    load_dataset(str(text_dir), cache_dir=str(cache_dir))
+    Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, version="auto", hash="auto").download_and_prepare()
     with pytest.raises(ValueError):
-        Cache(dataset_name="missing", version="auto", hash="auto").download_and_prepare()
+        Cache(cache_dir=str(cache_dir), dataset_name="missing", version="auto", hash="auto").download_and_prepare()
     with pytest.raises(ValueError):
-        Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare()
+        Cache(cache_dir=str(cache_dir), dataset_name=text_dir.name, hash="missing").download_and_prepare()
     with pytest.raises(ValueError):
-        Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare()
+        Cache(
+            cache_dir=str(cache_dir), dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto"
+        ).download_and_prepare()
 
 
 @pytest.mark.integration
-def test_cache_multi_configs():
+def test_cache_multi_configs(tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_multi_configs"
     repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA
     dataset_name = repo_id.split("/")[-1]
     config_name = "v1"
-    ds = load_dataset(repo_id, config_name)
-    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
+    ds = load_dataset(repo_id, config_name, cache_dir=str(cache_dir))
+    cache = Cache(
+        cache_dir=str(cache_dir),
+        dataset_name=dataset_name,
+        repo_id=repo_id,
+        config_name=config_name,
+        version="auto",
+        hash="auto",
+    )
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert len(ds["train"]) == len(reloaded["train"])
     with pytest.raises(ValueError) as excinfo:
-        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
+        Cache(
+            cache_dir=str(cache_dir),
+            dataset_name=dataset_name,
+            repo_id=repo_id,
+            config_name="missing",
+            version="auto",
+            hash="auto",
+        )
     assert config_name in str(excinfo.value)
 
 
 @pytest.mark.integration
-def test_cache_single_config():
+def test_cache_single_config(tmp_path: Path):
+    cache_dir = tmp_path / "test_cache_single_config"
     repo_id = SAMPLE_DATASET_SINGLE_CONFIG_IN_METADATA
     dataset_name = repo_id.split("/")[-1]
     config_name = "custom"
-    ds = load_dataset(repo_id)
-    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, version="auto", hash="auto")
+    ds = load_dataset(repo_id, cache_dir=str(cache_dir))
+    cache = Cache(cache_dir=str(cache_dir), dataset_name=dataset_name, repo_id=repo_id, version="auto", hash="auto")
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert len(ds["train"]) == len(reloaded["train"])
-    cache = Cache(dataset_name=dataset_name, config_name=config_name, repo_id=repo_id, version="auto", hash="auto")
+    cache = Cache(
+        cache_dir=str(cache_dir),
+        dataset_name=dataset_name,
+        config_name=config_name,
+        repo_id=repo_id,
+        version="auto",
+        hash="auto",
+    )
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert len(ds["train"]) == len(reloaded["train"])
     with pytest.raises(ValueError) as excinfo:
-        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
+        Cache(
+            cache_dir=str(cache_dir),
+            dataset_name=dataset_name,
+            repo_id=repo_id,
+            config_name="missing",
+            version="auto",
+            hash="auto",
+        )
     assert config_name in str(excinfo.value)

From ff35de61ecba6cea5c1e0807573b293e696ebb2a Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 25 Mar 2024 12:56:01 +0100
Subject: [PATCH 3/4] style

---
 src/datasets/packaged_modules/cache/cache.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py
index 4eb7855bd48..1b5e396af04 100644
--- a/src/datasets/packaged_modules/cache/cache.py
+++ b/src/datasets/packaged_modules/cache/cache.py
@@ -53,9 +53,7 @@ def _find_hash_in_cache(
     if not cached_directory_paths:
         cached_directory_paths = [
             cached_directory_path
-            for cached_directory_path in glob.glob(
-                os.path.join(cached_datasets_directory_path_root, "*", "*", "*")
-            )
+            for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", "*", "*"))
             if os.path.isdir(cached_directory_path)
         ]
         available_configs = sorted(

From dfb1be97b1597a3a50211a92265a9bac104b34ac Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 25 Mar 2024 16:52:27 +0100
Subject: [PATCH 4/4] mario's suggestion

---
 src/datasets/packaged_modules/cache/cache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py
index 1b5e396af04..2f31176e08b 100644
--- a/src/datasets/packaged_modules/cache/cache.py
+++ b/src/datasets/packaged_modules/cache/cache.py
@@ -46,7 +46,7 @@ def _find_hash_in_cache(
         and (
             config_kwargs
             or custom_features
-            or json.load(open(os.path.join(cached_directory_path, "dataset_info.json"), "r"))["config_name"]
+            or json.loads(Path(cached_directory_path, "dataset_info.json").read_text(encoding="utf-8"))["config_name"]
             == Path(cached_directory_path).parts[-3]  # no extra params => config_id == config_name
         )
     ]
@@ -74,7 +74,7 @@ def _find_hash_in_cache(
         and (
             config_kwargs
             or custom_features
-            or json.load(open(os.path.join(_cached_directory_path, "dataset_info.json"), "r"))["config_name"]
+            or json.loads(Path(_cached_directory_path, "dataset_info.json").read_text(encoding="utf-8"))["config_name"]
             == Path(_cached_directory_path).parts[-3]  # no extra params => config_id == config_name
         )
     ]
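
A minimal usage sketch of the reload path these patches fix, mirroring test_cache_single_config: it assumes the dataset has already been cached by a prior load_dataset() call, and reuses the hf-internal-testing sample repo from the tests.

# Sketch: reload an already-downloaded single-config dataset from the local cache.
# Assumes a prior `load_dataset(repo_id)` call populated the cache directory.
from datasets.packaged_modules.cache.cache import Cache

repo_id = "hf-internal-testing/audiofolder_single_config_in_metadata"
dataset_name = repo_id.split("/")[-1]

# With the fix, config_name can be omitted when the cache holds a single config:
# _find_hash_in_cache resolves config_name, version and hash from the cache contents.
cache = Cache(dataset_name=dataset_name, repo_id=repo_id, version="auto", hash="auto")
reloaded = cache.as_dataset()
print(reloaded)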