Remove deprecated code (#6761)
* Remove deprecated list_files_info

* remove legacy preupload_lfs_file + rename hf_hub_url partial

* fix tests

* fix

* fix tests
Wauplin authored Mar 29, 2024
1 parent dcd0104 commit 66d6242
Showing 7 changed files with 47 additions and 92 deletions.
21 changes: 15 additions & 6 deletions src/datasets/arrow_dataset.py
@@ -60,7 +60,15 @@
import pyarrow as pa
import pyarrow.compute as pc
from fsspec.core import url_to_fs
from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from huggingface_hub import (
CommitInfo,
CommitOperationAdd,
CommitOperationDelete,
DatasetCard,
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile
from multiprocess import Pool
from tqdm.contrib.concurrent import thread_map

@@ -115,7 +123,6 @@
from .utils import tqdm as hf_tqdm
from .utils.deprecation_utils import deprecated
from .utils.file_utils import estimate_dataset_size
from .utils.hub import list_files_info, preupload_lfs_files
from .utils.info_utils import is_small_dataset
from .utils.metadata import MetadataConfigs
from .utils.py_utils import (
@@ -5388,11 +5395,9 @@ def shards_with_embedded_external_files(shards):
shard.to_parquet(buffer)
uploaded_size += buffer.tell()
shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
preupload_lfs_files(
api,
api.preupload_lfs_files(
repo_id=repo_id,
additions=[shard_addition],
token=token,
repo_type="dataset",
revision=revision,
create_pr=create_pr,
@@ -5577,7 +5582,11 @@ def push_to_hub(
deletions, deleted_size = [], 0
repo_splits = [] # use a list to keep the order of the splits
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
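
For reference, the shard-upload step now calls HfApi.preupload_lfs_files directly instead of the removed wrapper in datasets.utils.hub, which only existed to retry 500/503 errors on huggingface_hub<0.20.0. A minimal sketch of the new call pattern, assuming an authenticated client and a made-up repo_id:

from io import BytesIO

from huggingface_hub import CommitOperationAdd, HfApi

api = HfApi()  # token comes from the cached login or HF_TOKEN; it is no longer passed per call
buffer = BytesIO(b"...")  # placeholder for a Parquet shard serialized in memory
shard_addition = CommitOperationAdd(path_in_repo="data/train-00000-of-00001.parquet", path_or_fileobj=buffer)

# Pre-upload the LFS blob so that the later create_commit call only has to reference it.
api.preupload_lfs_files(
    repo_id="user/my-dataset",
    additions=[shard_addition],
    repo_type="dataset",
    revision="main",
)
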
8 changes: 6 additions & 2 deletions src/datasets/dataset_dict.py
@@ -21,6 +21,7 @@
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile

from . import config
from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset
@@ -34,7 +35,6 @@
from .utils import logging
from .utils.deprecation_utils import deprecated
from .utils.doc_utils import is_documented_by
from .utils.hub import list_files_info
from .utils.metadata import MetadataConfigs
from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict
from .utils.typing import PathLike
@@ -1745,7 +1745,11 @@ def push_to_hub(
repo_splits = [] # use a list to keep the order of the splits
deletions = []
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
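
The replacement for list_files_info, used identically in Dataset.push_to_hub above and in DatasetDict.push_to_hub here, is list_repo_tree with recursive=True; since it yields both RepoFile and RepoFolder entries, folders are skipped explicitly. A rough standalone sketch of the pattern, with a hypothetical repo_id:

from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()

# Walk every entry in the dataset repo and keep only real files,
# mirroring what the old list_files_info helper used to yield.
parquet_files = []
for entry in api.list_repo_tree(repo_id="user/my-dataset", repo_type="dataset", recursive=True):
    if not isinstance(entry, RepoFile):
        continue  # skip RepoFolder entries
    if entry.rfilename.endswith(".parquet"):
        parquet_files.append(entry.rfilename)
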
20 changes: 10 additions & 10 deletions src/datasets/load.py
@@ -84,7 +84,7 @@
relative_to_absolute_path,
url_or_path_join,
)
from .utils.hub import hf_hub_url
from .utils.hub import hf_dataset_url
from .utils.info_utils import VerificationMode, is_small_dataset
from .utils.logging import get_logger
from .utils.metadata import MetadataConfigs
@@ -1211,7 +1211,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
@@ -1222,7 +1222,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading standalone yaml"
try:
standalone_yaml_path = cached_path(
hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
download_config=download_config,
)
with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1308,7 +1308,7 @@ def get_module(self) -> DatasetModule:
]
default_config_name = None
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
"repo_id": self.name,
"dataset_name": camelcase_to_snakecase(Path(self.name).name),
}
@@ -1320,7 +1320,7 @@
try:
# this file is deprecated and was created automatically in old versions of push_to_hub
dataset_infos_path = cached_path(
hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
download_config=download_config,
)
with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1444,14 +1444,14 @@ def __init__(
increase_load_count(name, resource_type="dataset")

def download_loading_script(self) -> str:
file_path = hf_hub_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading builder script"
return cached_path(file_path, download_config=download_config)

def download_dataset_infos_file(self) -> str:
dataset_infos = hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1465,7 +1465,7 @@ def download_dataset_infos_file(self) -> str:
return None

def download_dataset_readme_file(self) -> str:
readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1494,7 +1494,7 @@ def get_module(self) -> DatasetModule:
imports = get_imports(local_path)
local_imports = _download_additional_modules(
name=self.name,
base_path=hf_hub_url(self.name, "", revision=self.revision),
base_path=hf_dataset_url(self.name, "", revision=self.revision),
imports=imports,
download_config=self.download_config,
)
@@ -1540,7 +1540,7 @@ def get_module(self) -> DatasetModule:
# make the new module to be noticed by the import system
importlib.invalidate_caches()
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"),
"repo_id": self.name,
}
return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
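
All of the load.py call sites above follow the same shape: build a resolve URL inside the dataset repo with hf_dataset_url and fetch it through cached_path. A condensed sketch of that shape (the repo name is made up):

from datasets import DownloadConfig, config
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_dataset_url

# Download the dataset card of a (hypothetical) repo through the cache.
download_config = DownloadConfig(download_desc="Downloading readme")
readme_path = cached_path(
    hf_dataset_url("user/my-dataset", config.REPOCARD_FILENAME, revision=None),
    download_config=download_config,
)
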
62 changes: 2 additions & 60 deletions src/datasets/utils/hub.py
@@ -1,64 +1,6 @@
import time
from functools import partial

from huggingface_hub import HfApi, hf_hub_url
from huggingface_hub.hf_api import RepoFile
from packaging import version
from requests import ConnectionError, HTTPError
from huggingface_hub import hf_hub_url

from .. import config
from . import logging


logger = logging.get_logger(__name__)

# Retry `preupload_lfs_files` in `huggingface_hub<0.20.0` on the "500 (Internal Server Error)" and "503 (Service Unavailable)" HTTP errors
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
max_retries = 5
base_wait_time = 1
max_wait_time = 8
retry = 0
while True:
try:
hf_api.preupload_lfs_files(**kwargs)
except (RuntimeError, HTTPError, ConnectionError) as err:
if isinstance(err, RuntimeError):
if isinstance(err.__cause__, (HTTPError, ConnectionError)):
err = err.__cause__
else:
raise err
if retry >= max_retries or err.response and err.response.status_code not in [500, 503]:
raise err
else:
sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
logger.info(
f"{hf_api.preupload_lfs_files} timed out, retrying in {sleep_time}s... [{retry/max_retries}]"
)
time.sleep(sleep_time)
retry += 1
else:
break
else:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
hf_api.preupload_lfs_files(**kwargs)


# `list_files_info` is deprecated in favor of `list_repo_tree` in `huggingface_hub>=0.20.0`
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def list_files_info(hf_api: HfApi, **kwargs):
yield from hf_api.list_files_info(**kwargs)
else:

def list_files_info(hf_api: HfApi, **kwargs):
kwargs = {**kwargs, "recursive": True}
for repo_path in hf_api.list_repo_tree(**kwargs):
if isinstance(repo_path, RepoFile):
yield repo_path


# bakckward compatibility
hf_hub_url = partial(hf_hub_url, repo_type="dataset")
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
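
With the compatibility shims gone, the module reduces to a single helper: hf_dataset_url is just hf_hub_url with repo_type pinned to "dataset", so it builds resolve URLs under the datasets/ namespace. A quick illustration with a made-up repo:

from datasets.utils.hub import hf_dataset_url

url = hf_dataset_url(repo_id="user/my-dataset", filename="README.md", revision=None)
# -> https://huggingface.co/datasets/user/my-dataset/resolve/main/README.md

# An empty filename gives the repo root, used as base_path after stripping the trailing slash.
base_path = hf_dataset_url(repo_id="user/my-dataset", filename="", revision=None).rstrip("/")
# -> https://huggingface.co/datasets/user/my-dataset/resolve/main
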
6 changes: 3 additions & 3 deletions tests/test_hub.py
@@ -2,12 +2,12 @@

import pytest

from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url


@pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"])
@pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"])
@pytest.mark.parametrize("revision", [None, "v2"])
def test_hf_hub_url(repo_id, filename, revision):
url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
def test_dataset_url(repo_id, filename, revision):
url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)
assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}"
16 changes: 8 additions & 8 deletions tests/test_streaming_download_manager.py
@@ -28,7 +28,7 @@
xwalk,
)
from datasets.filesystems import COMPRESSION_FILESYSTEMS
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url

from .utils import require_lz4, require_zstandard, slow

@@ -236,7 +236,7 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xexists(root_url + "data/text_data.txt", download_config=download_config)
assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config)
@@ -321,7 +321,7 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1
assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2
@@ -350,7 +350,7 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert xisdir("zip://::" + root_url, download_config=download_config) is True
assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True
@@ -376,7 +376,7 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True
assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False
@@ -400,7 +400,7 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39
with pytest.raises(FileNotFoundError):
@@ -444,7 +444,7 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3
assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0
@@ -483,7 +483,7 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2
assert len(list(xwalk("zip://main_dir::" + root_url, download_config=download_config))) == 1
6 changes: 3 additions & 3 deletions tests/test_upstream_hub.py
@@ -33,7 +33,7 @@
FolderBasedBuilderConfig,
)
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url
from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error

@@ -608,7 +608,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
@@ -757,7 +757,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
