Skip to content

Commit

Permalink
feat: 🎸 upgrade datasets to 2.3.1 (#375)
Browse files Browse the repository at this point in the history
* feat: 🎸 upgrade datasets to 2.3.0

* feat: 🎸 add make refresh-cache to trigger a refresh of all ds

* style: 💄 fix style

* feat: 🎸 upgrade admin and worker images

* test: 💍 disable a buggy test

* feat: 🎸 upgrade datasets to 2.3.1

* feat: 🎸 upgrade the worker image
  • Loading branch information
severo authored Jun 15, 2022
1 parent c768827 commit 3991247
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 26 deletions.
6 changes: 3 additions & 3 deletions infra/charts/datasets-server/docker-images.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"dockerImage": {
"admin": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-admin:sha-9592dc1",
"admin": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-admin:sha-3327d8f",
"api": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-api:sha-9592dc1",
"datasetsWorker": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-worker:sha-a7d745e",
"datasetsWorker": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-worker:sha-3f61bbe",
"reverseProxy": "docker.io/nginx:1.20",
"splitsWorker": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-worker:sha-a7d745e"
"splitsWorker": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-worker:sha-3f61bbe"
}
}
4 changes: 4 additions & 0 deletions services/admin/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ cancel-started-split-jobs:
cancel-started-dataset-jobs:
poetry run python src/admin/scripts/cancel_started_dataset_jobs.py

.PHONY: refresh-cache
refresh-cache:
poetry run python src/admin/scripts/refresh_cache.py

.PHONY: warm-cache
warm-cache:
poetry run python src/admin/scripts/warm_cache.py
Expand Down
31 changes: 31 additions & 0 deletions services/admin/src/admin/scripts/refresh_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import logging
from typing import List

from dotenv import load_dotenv
from huggingface_hub import list_datasets # type: ignore
from libqueue.queue import add_dataset_job, connect_to_queue
from libutils.logger import init_logger

from admin.config import LOG_LEVEL, MONGO_QUEUE_DATABASE, MONGO_URL

# Load environment variables defined in .env, if any
load_dotenv()


def get_hf_dataset_names() -> List[str]:
    """Return the ids of the datasets reported by ``list_datasets``.

    Each dataset's ``id`` attribute is converted to ``str`` so that callers
    always receive plain strings.

    Returns:
        The dataset ids, in the order ``list_datasets`` yields them.
    """
    # NOTE(review): only ``.id`` is read here, so ``full=True`` may fetch more
    # metadata than needed - confirm against the huggingface_hub version in
    # use before changing it.
    return [str(dataset.id) for dataset in list_datasets(full=True)]


def refresh_datasets_cache(dataset_names: List[str]) -> None:
    """Add one dataset job per name in *dataset_names* and log each addition.

    Args:
        dataset_names: names passed, one at a time, to ``add_dataset_job``.
    """
    # NOTE(review): the logger is named "warm_cache" although this is the
    # refresh script; it matches the name passed to init_logger in the entry
    # point, so both must be changed together if it is ever renamed.
    log = logging.getLogger("warm_cache")
    for dataset_name in dataset_names:
        add_dataset_job(dataset_name)
        log.info(f"added a job to refresh '{dataset_name}'")


if __name__ == "__main__":
    # Script entry point: set up logging, connect to the queue database, then
    # add a job for every dataset name returned by get_hf_dataset_names().
    init_logger(LOG_LEVEL, "warm_cache")
    connect_to_queue(MONGO_QUEUE_DATABASE, MONGO_URL)
    hub_dataset_names = get_hf_dataset_names()
    refresh_datasets_cache(hub_dataset_names)
    logging.getLogger("warm_cache").info(
        "all the datasets of the Hub have been added to the queue to refresh the cache"
    )
28 changes: 14 additions & 14 deletions services/worker/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion services/worker/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ aiohttp = "^3.7.4.post0"
apache-beam = "^2.33.0"
bs4 = "^0.0.1"
conllu = "^4.4.1"
datasets = { extras = ["audio", "vision"], version = "^2.2.2" }
datasets = { extras = ["audio", "vision"], version = "^2.3.1" }
gdown = "^4.2.0"
kenlm = { url = "https://github.com/kpu/kenlm/archive/master.zip" }
kss = "^2.6.0"
Expand Down
18 changes: 10 additions & 8 deletions services/worker/tests/models/test_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,16 @@ def test_get_split() -> None:
assert split["num_examples"] == 14006


def test_gated() -> None:
dataset_name = "severo/dummy_gated"
config_name = "severo--embellishments"
split_name = "train"
split = get_split(dataset_name, config_name, split_name, HF_TOKEN, rows_max_number=ROWS_MAX_NUMBER)

assert len(split["rows_response"]["rows"]) == ROWS_MAX_NUMBER
assert split["rows_response"]["rows"][0]["row"]["year"] == "1855"
# disabled since the dataset has a problem unrelated to the gated status
# see https://github.com/huggingface/datasets-server/pull/375#issuecomment-1156425010
# def test_gated() -> None:
# dataset_name = "severo/dummy_gated"
# config_name = "severo--embellishments"
# split_name = "train"
# split = get_split(dataset_name, config_name, split_name, HF_TOKEN, rows_max_number=ROWS_MAX_NUMBER)

# assert len(split["rows_response"]["rows"]) == ROWS_MAX_NUMBER
# assert split["rows_response"]["rows"][0]["row"]["year"] == "1855"


def test_fallback() -> None:
Expand Down

0 comments on commit 3991247

Please sign in to comment.