diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index f66cfecfe8c..945d9bad873 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -60,7 +60,15 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 from fsspec.core import url_to_fs
-from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
+from huggingface_hub import (
+    CommitInfo,
+    CommitOperationAdd,
+    CommitOperationDelete,
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+)
+from huggingface_hub.hf_api import RepoFile
 from multiprocess import Pool
 from tqdm.contrib.concurrent import thread_map
 
@@ -115,7 +123,6 @@
 from .utils import tqdm as hf_tqdm
 from .utils.deprecation_utils import deprecated
 from .utils.file_utils import estimate_dataset_size
-from .utils.hub import list_files_info, preupload_lfs_files
 from .utils.info_utils import is_small_dataset
 from .utils.metadata import MetadataConfigs
 from .utils.py_utils import (
@@ -5388,11 +5395,9 @@ def shards_with_embedded_external_files(shards):
             shard.to_parquet(buffer)
             uploaded_size += buffer.tell()
             shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
-            preupload_lfs_files(
-                api,
+            api.preupload_lfs_files(
                 repo_id=repo_id,
                 additions=[shard_addition],
-                token=token,
                 repo_type="dataset",
                 revision=revision,
                 create_pr=create_pr,
@@ -5577,7 +5582,11 @@ def push_to_hub(
         deletions, deleted_size = [], 0
         repo_splits = []  # use a list to keep the order of the splits
         repo_files_to_add = [addition.path_in_repo for addition in additions]
-        for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
+        for repo_file in api.list_repo_tree(
+            repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
+        ):
+            if not isinstance(repo_file, RepoFile):
+                continue
             if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
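The `arrow_dataset.py` changes drop the `preupload_lfs_files` wrapper from `datasets.utils.hub` and call the `HfApi` method directly, with the token resolved by the `HfApi` client instead of being passed per call. A minimal sketch of the new shard-upload flow, where the repo id and shard bytes are illustrative placeholders, not values from the PR:

```python
from io import BytesIO

from huggingface_hub import CommitOperationAdd, HfApi

api = HfApi()  # token resolution is handled by the client
buffer = BytesIO(b"...")  # in the PR this holds a Parquet-serialized dataset shard
shard_addition = CommitOperationAdd(path_in_repo="data/train-00000-of-00001.parquet", path_or_fileobj=buffer)

# Upload the LFS blob first, then reference it in a commit.
api.preupload_lfs_files(repo_id="user/my-dataset", additions=[shard_addition], repo_type="dataset")
api.create_commit(
    repo_id="user/my-dataset",
    operations=[shard_addition],
    commit_message="Upload data",
    repo_type="dataset",
)
```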
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 18c99dac1b8..563479f26a4 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -21,6 +21,7 @@
     DatasetCardData,
     HfApi,
 )
+from huggingface_hub.hf_api import RepoFile
 
 from . import config
 from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset
@@ -34,7 +35,6 @@
 from .utils import logging
 from .utils.deprecation_utils import deprecated
 from .utils.doc_utils import is_documented_by
-from .utils.hub import list_files_info
 from .utils.metadata import MetadataConfigs
 from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict
 from .utils.typing import PathLike
@@ -1745,7 +1745,11 @@ def push_to_hub(
         repo_splits = []  # use a list to keep the order of the splits
         deletions = []
         repo_files_to_add = [addition.path_in_repo for addition in additions]
-        for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
+        for repo_file in api.list_repo_tree(
+            repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
+        ):
+            if not isinstance(repo_file, RepoFile):
+                continue
             if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
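Both `push_to_hub` implementations above now list existing repository files with `HfApi.list_repo_tree`, which yields both `RepoFile` and `RepoFolder` entries, hence the explicit `isinstance` filter. A rough sketch of that listing step, with an illustrative repo id:

```python
from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()
existing_files = [
    entry.rfilename
    for entry in api.list_repo_tree("user/my-dataset", repo_type="dataset", recursive=True)
    if isinstance(entry, RepoFile)  # skip RepoFolder entries
]
```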
diff --git a/src/datasets/load.py b/src/datasets/load.py
index c08ad274acf..991a3b35651 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -84,7 +84,7 @@
     relative_to_absolute_path,
     url_or_path_join,
 )
-from .utils.hub import hf_hub_url
+from .utils.hub import hf_dataset_url
 from .utils.info_utils import VerificationMode, is_small_dataset
 from .utils.logging import get_logger
 from .utils.metadata import MetadataConfigs
@@ -1211,7 +1211,7 @@ def get_module(self) -> DatasetModule:
             download_config.download_desc = "Downloading readme"
         try:
             dataset_readme_path = cached_path(
-                hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
                 download_config=download_config,
             )
             dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
@@ -1222,7 +1222,7 @@ def get_module(self) -> DatasetModule:
             download_config.download_desc = "Downloading standalone yaml"
         try:
             standalone_yaml_path = cached_path(
-                hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
                 download_config=download_config,
             )
             with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1308,7 +1308,7 @@ def get_module(self) -> DatasetModule:
             ]
             default_config_name = None
         builder_kwargs = {
-            "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
+            "base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
            "repo_id": self.name,
            "dataset_name": camelcase_to_snakecase(Path(self.name).name),
         }
@@ -1320,7 +1320,7 @@ def get_module(self) -> DatasetModule:
         try:
             # this file is deprecated and was created automatically in old versions of push_to_hub
             dataset_infos_path = cached_path(
-                hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
+                hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
                 download_config=download_config,
             )
             with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1444,14 +1444,14 @@ def __init__(
         increase_load_count(name, resource_type="dataset")
 
     def download_loading_script(self) -> str:
-        file_path = hf_hub_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
+        file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading builder script"
         return cached_path(file_path, download_config=download_config)
 
     def download_dataset_infos_file(self) -> str:
-        dataset_infos = hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
+        dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1465,7 +1465,7 @@ def download_dataset_infos_file(self) -> str:
             return None
 
     def download_dataset_readme_file(self) -> str:
-        readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
+        readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1494,7 +1494,7 @@ def get_module(self) -> DatasetModule:
         imports = get_imports(local_path)
         local_imports = _download_additional_modules(
             name=self.name,
-            base_path=hf_hub_url(self.name, "", revision=self.revision),
+            base_path=hf_dataset_url(self.name, "", revision=self.revision),
             imports=imports,
             download_config=self.download_config,
         )
@@ -1540,7 +1540,7 @@ def get_module(self) -> DatasetModule:
         # make the new module to be noticed by the import system
         importlib.invalidate_caches()
         builder_kwargs = {
-            "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
+            "base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"),
             "repo_id": self.name,
         }
         return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
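In `load.py`, every module-factory URL is now built with `hf_dataset_url`, which is `hf_hub_url` with `repo_type="dataset"` pre-bound (see the `utils/hub.py` diff below). For example, with an illustrative repo id and revision:

```python
from datasets.utils.hub import hf_dataset_url

readme_url = hf_dataset_url("user/my-dataset", "README.md", revision="main")
# -> "https://huggingface.co/datasets/user/my-dataset/resolve/main/README.md"
base_path = hf_dataset_url("user/my-dataset", "", revision="main").rstrip("/")
# -> "https://huggingface.co/datasets/user/my-dataset/resolve/main"
```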
diff --git a/src/datasets/utils/hub.py b/src/datasets/utils/hub.py
index f19181b41b1..6d784333b23 100644
--- a/src/datasets/utils/hub.py
+++ b/src/datasets/utils/hub.py
@@ -1,64 +1,6 @@
-import time
 from functools import partial
 
-from huggingface_hub import HfApi, hf_hub_url
-from huggingface_hub.hf_api import RepoFile
-from packaging import version
-from requests import ConnectionError, HTTPError
+from huggingface_hub import hf_hub_url
 
-from .. import config
-from . import logging
-
-
-logger = logging.get_logger(__name__)
-
-# Retry `preupload_lfs_files` in `huggingface_hub<0.20.0` on the "500 (Internal Server Error)" and "503 (Service Unavailable)" HTTP errors
-if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:
-
-    def preupload_lfs_files(hf_api: HfApi, **kwargs):
-        max_retries = 5
-        base_wait_time = 1
-        max_wait_time = 8
-        retry = 0
-        while True:
-            try:
-                hf_api.preupload_lfs_files(**kwargs)
-            except (RuntimeError, HTTPError, ConnectionError) as err:
-                if isinstance(err, RuntimeError):
-                    if isinstance(err.__cause__, (HTTPError, ConnectionError)):
-                        err = err.__cause__
-                    else:
-                        raise err
-                if retry >= max_retries or err.response and err.response.status_code not in [500, 503]:
-                    raise err
-                else:
-                    sleep_time = min(max_wait_time, base_wait_time * 2**retry)  # Exponential backoff
-                    logger.info(
-                        f"{hf_api.preupload_lfs_files} timed out, retrying in {sleep_time}s... [{retry/max_retries}]"
-                    )
-                    time.sleep(sleep_time)
-                    retry += 1
-            else:
-                break
-else:
-
-    def preupload_lfs_files(hf_api: HfApi, **kwargs):
-        hf_api.preupload_lfs_files(**kwargs)
-
-
-# `list_files_info` is deprecated in favor of `list_repo_tree` in `huggingface_hub>=0.20.0`
-if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:
-
-    def list_files_info(hf_api: HfApi, **kwargs):
-        yield from hf_api.list_files_info(**kwargs)
-else:
-
-    def list_files_info(hf_api: HfApi, **kwargs):
-        kwargs = {**kwargs, "recursive": True}
-        for repo_path in hf_api.list_repo_tree(**kwargs):
-            if isinstance(repo_path, RepoFile):
-                yield repo_path
-
-
-# bakckward compatibility
-hf_hub_url = partial(hf_hub_url, repo_type="dataset")
+
+hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
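For reference, the removed compatibility shim retried `preupload_lfs_files` on HTTP 500/503 with capped exponential backoff; the schedule it implemented was:

```python
# Backoff schedule of the removed wrapper: 1, 2, 4, 8, 8 seconds across the 5 retries.
base_wait_time, max_wait_time, max_retries = 1, 8, 5
sleep_times = [min(max_wait_time, base_wait_time * 2**retry) for retry in range(max_retries)]
assert sleep_times == [1, 2, 4, 8, 8]
```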
diff --git a/tests/test_hub.py b/tests/test_hub.py
index e940d7b8b29..116ed588585 100644
--- a/tests/test_hub.py
+++ b/tests/test_hub.py
@@ -2,12 +2,12 @@
 
 import pytest
 
-from datasets.utils.hub import hf_hub_url
+from datasets.utils.hub import hf_dataset_url
 
 
 @pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"])
 @pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"])
 @pytest.mark.parametrize("revision", [None, "v2"])
-def test_hf_hub_url(repo_id, filename, revision):
-    url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
+def test_dataset_url(repo_id, filename, revision):
+    url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)
     assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}"
diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
index 44e73eee73c..3e90514b26e 100644
--- a/tests/test_streaming_download_manager.py
+++ b/tests/test_streaming_download_manager.py
@@ -28,7 +28,7 @@
     xwalk,
 )
 from datasets.filesystems import COMPRESSION_FILESYSTEMS
-from datasets.utils.hub import hf_hub_url
+from datasets.utils.hub import hf_dataset_url
 
 from .utils import require_lz4, require_zstandard, slow
 
@@ -236,7 +236,7 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
+    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
     download_config = DownloadConfig(token=hf_token)
     assert xexists(root_url + "data/text_data.txt", download_config=download_config)
     assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config)
@@ -321,7 +321,7 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
+    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
     download_config = DownloadConfig(token=hf_token)
     assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1
     assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2
@@ -350,7 +350,7 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
+    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
     download_config = DownloadConfig(token=hf_token)
     assert xisdir("zip://::" + root_url, download_config=download_config) is True
     assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True
@@ -376,7 +376,7 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
+    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
     download_config = DownloadConfig(token=hf_token)
     assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True
     assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False
@@ -400,7 +400,7 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
+    root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
     download_config = DownloadConfig(token=hf_token)
     assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39
     with pytest.raises(FileNotFoundError):
@@ -444,7 +444,7 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
+    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
     download_config = DownloadConfig(token=hf_token)
     assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3
     assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0
@@ -483,7 +483,7 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec):
 
 @pytest.mark.integration
 def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
-    root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
+    root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
     download_config = DownloadConfig(token=hf_token)
     assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2
     assert len(list(xwalk("zip://main_dir::" + root_url, download_config=download_config))) == 1
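The streaming tests keep chaining fsspec protocols on top of the Hub URL; only the helper name changes. Roughly, with an illustrative repo id (a private repo additionally needs `DownloadConfig(token=...)`):

```python
from datasets.utils.hub import hf_dataset_url

root_url = hf_dataset_url("user/my-dataset", "data.zip")
# Read inside the zip archive hosted on the Hub via fsspec URL chaining:
chained_url = "zip://main_dir::" + root_url
```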
diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py
index 2952cf310bf..daa0841ac98 100644
--- a/tests/test_upstream_hub.py
+++ b/tests/test_upstream_hub.py
@@ -33,7 +33,7 @@
     FolderBasedBuilderConfig,
 )
 from datasets.utils.file_utils import cached_path
-from datasets.utils.hub import hf_hub_url
+from datasets.utils.hub import hf_dataset_url
 from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
 from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error
 
@@ -608,7 +608,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(
         ds_config2.push_to_hub(ds_name, "config2", token=self._token)
 
         # check that configs args was correctly pushed to README.md
-        ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
+        ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
         dataset_card_data = DatasetCard.load(ds_readme_path).data
         assert METADATA_CONFIGS_FIELD in dataset_card_data
         assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
@@ -757,7 +757,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(
         ds_config2.push_to_hub(ds_name, "config2", token=self._token)
 
         # check that configs args was correctly pushed to README.md
-        ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
+        ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
         dataset_card_data = DatasetCard.load(ds_readme_path).data
         assert METADATA_CONFIGS_FIELD in dataset_card_data
         assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)