From 6d89678d894db2b05f5e185f3b2586e7c5e90bd7 Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 22 Sep 2022 09:56:30 +0000 Subject: [PATCH 1/8] =?UTF-8?q?docs:=20=E2=9C=8F=EF=B8=8F=20add=20missing?= =?UTF-8?q?=20argument=20in=20docstring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/worker/src/worker/responses/first_rows.py | 2 ++ services/worker/src/worker/responses/splits.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/services/worker/src/worker/responses/first_rows.py b/services/worker/src/worker/responses/first_rows.py index 7c2bb200e5..9bd1f387a3 100644 --- a/services/worker/src/worker/responses/first_rows.py +++ b/services/worker/src/worker/responses/first_rows.py @@ -251,6 +251,8 @@ def get_first_rows_response( A split name. assets_base_url (`str`): The base url of the assets. + hf_endpoint (`str`): + The Hub endpoint (for example: "https://huggingface.co") hf_token (`str`, *optional*): An authentication token (See https://huggingface.co/settings/token) max_size_fallback (`int`, *optional*): diff --git a/services/worker/src/worker/responses/splits.py b/services/worker/src/worker/responses/splits.py index a32bf8155e..68a56a046c 100644 --- a/services/worker/src/worker/responses/splits.py +++ b/services/worker/src/worker/responses/splits.py @@ -55,6 +55,8 @@ def get_splits_response( dataset (`str`): A namespace (user or an organization) and a repo name separated by a `/`. + hf_endpoint (`str`): + The Hub endpoint (for example: "https://huggingface.co") hf_token (`str`, *optional*): An authentication token (See https://huggingface.co/settings/token) Returns: From 5f755c12b7e5fd0aabdf4bb6d60450d1d6f068b8 Mon Sep 17 00:00:00 2001 From: Test User Date: Thu, 22 Sep 2022 10:34:53 +0000 Subject: [PATCH 2/8] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20change=20the=20logic?= =?UTF-8?q?=20of=20the=20webhook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now use the webhook only to get the name of the dataset to handle. Then, if the dataset is supported (exists, is public, can be gated or not), update the cache; otherwise, delete the cache. --- services/api/README.md | 1 + services/api/poetry.lock | 73 ++++++++++++++++-- services/api/pyproject.toml | 1 + services/api/src/api/app.py | 22 +++++- services/api/src/api/config.py | 2 + services/api/src/api/constants.py | 3 + services/api/src/api/hub.py | 30 ++++++++ services/api/src/api/routes/first_rows.py | 6 +- services/api/src/api/routes/splits.py | 7 +- services/api/src/api/routes/webhook.py | 94 ++++++++++++----------- 10 files changed, 180 insertions(+), 59 deletions(-) create mode 100644 services/api/src/api/hub.py diff --git a/services/api/README.md b/services/api/README.md index 2f26f795ea..9a232f53e0 100644 --- a/services/api/README.md +++ b/services/api/README.md @@ -12,6 +12,7 @@ Set environment variables to configure the following aspects: - `ASSETS_DIRECTORY`: directory where the asset files are stored. Defaults to empty, in which case the assets are located in the `datasets_server_assets` subdirectory inside the OS default cache directory. - `HF_AUTH_PATH`: the path of the external authentication service, on the hub (see `HF_ENDPOINT`). The string must contain `%s` which will be replaced with the dataset name. The external authentication service must return 200, 401, 403 or 404. If empty, the authentication is disabled. Defaults to "/api/datasets/%s/auth-check". - `HF_ENDPOINT`: URL of the HuggingFace Hub. 
Defaults to `https://huggingface.co`. +- `HF_TOKEN`: App Access Token (ask moonlanding administrators to get one, only the `read` role is required), to access the gated datasets. Defaults to empty. - `LOG_LEVEL`: log level, among `DEBUG`, `INFO`, `WARNING`, `ERROR` and `CRITICAL`. Defaults to `INFO`. - `MAX_AGE_LONG_SECONDS`: number of seconds to set in the `max-age` header on data endpoints. Defaults to `120` (2 minutes). - `MAX_AGE_SHORT_SECONDS`: number of seconds to set in the `max-age` header on technical endpoints. Defaults to `10` (10 seconds). diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 30c29709f7..0b7e1c75d1 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -101,7 +101,7 @@ uvloop = ["uvloop (>=0.15.2)"] name = "certifi" version = "2022.5.18.1" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -109,7 +109,7 @@ python-versions = ">=3.6" name = "charset-normalizer" version = "2.0.12" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" +category = "main" optional = false python-versions = ">=3.5.0" @@ -174,6 +174,18 @@ toml = "*" [package.extras] pipenv = ["pipenv"] +[[package]] +name = "filelock" +version = "3.8.0" +description = "A platform independent file lock." +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2022.6.21)", "sphinx (>=5.1.1)", "sphinx-autodoc-typehints (>=1.19.1)"] +testing = ["covdefaults (>=2.2)", "coverage (>=6.4.2)", "pytest (>=7.1.2)", "pytest-cov (>=3)", "pytest-timeout (>=2.1)"] + [[package]] name = "flake8" version = "3.9.2" @@ -217,6 +229,31 @@ category = "main" optional = false python-versions = ">=3.6" +[[package]] +name = "huggingface-hub" +version = "0.9.1" +description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" +category = "main" +optional = false +python-versions = ">=3.7.0" + +[package.dependencies] +filelock = "*" +packaging = ">=20.9" +pyyaml = ">=5.1" +requests = "*" +tqdm = "*" +typing-extensions = ">=3.7.4.3" + +[package.extras] +torch = ["torch"] +testing = ["soundfile", "datasets", "pytest-cov", "pytest"] +tensorflow = ["graphviz", "pydot", "tensorflow"] +quality = ["flake8-bugbear", "flake8 (>=3.8.3)", "isort (>=5.5.4)", "black (==22.3)"] +fastai = ["fastcore (>=1.3.27)", "fastai (>=2.4)", "toml"] +dev = ["flake8-bugbear", "flake8 (>=3.8.3)", "isort (>=5.5.4)", "black (==22.3)", "soundfile", "datasets", "pytest-cov", "pytest"] +all = ["flake8-bugbear", "flake8 (>=3.8.3)", "isort (>=5.5.4)", "black (==22.3)", "soundfile", "datasets", "pytest-cov", "pytest"] + [[package]] name = "idna" version = "3.3" @@ -361,7 +398,7 @@ python-versions = ">=3.7" name = "packaging" version = "21.3" description = "Core utilities for Python packages" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" @@ -479,7 +516,7 @@ zstd = ["zstandard"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "dev" +category = "main" optional = false python-versions = ">=3.6.8" @@ -535,7 +572,7 @@ python-versions = ">=3.6" name = "requests" version = "2.28.0" description = "Python HTTP for Humans." 
-category = "dev" +category = "main" optional = false python-versions = ">=3.7, <4" @@ -679,6 +716,23 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "typed-ast" version = "1.4.3" @@ -691,7 +745,7 @@ python-versions = "*" name = "typing-extensions" version = "4.2.0" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" @@ -699,7 +753,7 @@ python-versions = ">=3.7" name = "urllib3" version = "1.26.9" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "dev" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" @@ -741,7 +795,7 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "1.1" python-versions = "3.9.6" -content-hash = "972c7d6f5c61a411052028a6e328d28ec6fddcaa8e172b3a8c3cc9a93ce93645" +content-hash = "4d293310af057fa5b93e81a80698d9a4a3a5a5358909cff78ceb352ba7207959" [metadata.files] anyio = [ @@ -860,6 +914,7 @@ dparse = [ {file = "dparse-0.5.1-py3-none-any.whl", hash = "sha256:e953a25e44ebb60a5c6efc2add4420c177f1d8404509da88da9729202f306994"}, {file = "dparse-0.5.1.tar.gz", hash = "sha256:a1b5f169102e1c894f9a7d5ccf6f9402a836a5d24be80a986c7ce9eaed78f367"}, ] +filelock = [] flake8 = [ {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, @@ -876,6 +931,7 @@ h11 = [ {file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"}, {file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"}, ] +huggingface-hub = [] idna = [ {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, @@ -1208,6 +1264,7 @@ tomlkit = [ {file = "tomlkit-0.7.2-py2.py3-none-any.whl", hash = "sha256:173ad840fa5d2aac140528ca1933c29791b79a374a0861a80347f42ec9328117"}, {file = "tomlkit-0.7.2.tar.gz", hash = "sha256:d7a454f319a7e9bd2e249f239168729327e4dd2d27b17dc68be264ad1ce36754"}, ] +tqdm = [] typed-ast = [ {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6"}, {file = "typed_ast-1.4.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075"}, diff --git a/services/api/pyproject.toml b/services/api/pyproject.toml index aa31046ba0..3f7723bddb 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -6,6 +6,7 @@ version = "0.1.3" license = "Apache-2.0" [tool.poetry.dependencies] +huggingface-hub = "^0.9.1" libcache = { path = 
"../../libs/libcache/dist/libcache-0.2.1-py3-none-any.whl", develop = false } libqueue = { path = "../../libs/libqueue/dist/libqueue-0.2.0-py3-none-any.whl", develop = false } libutils = { path = "../../libs/libutils/dist/libutils-0.2.0-py3-none-any.whl", develop = false } diff --git a/services/api/src/api/app.py b/services/api/src/api/app.py index 4815c7e9d7..cc75371af4 100644 --- a/services/api/src/api/app.py +++ b/services/api/src/api/app.py @@ -22,6 +22,8 @@ APP_PORT, ASSETS_DIRECTORY, EXTERNAL_AUTH_URL, + HF_ENDPOINT, + HF_TOKEN, LOG_LEVEL, MONGO_CACHE_DATABASE, MONGO_QUEUE_DATABASE, @@ -32,7 +34,7 @@ from api.routes.healthcheck import healthcheck_endpoint from api.routes.splits import create_splits_endpoint from api.routes.valid import create_is_valid_endpoint, valid_endpoint -from api.routes.webhook import webhook_endpoint +from api.routes.webhook import create_webhook_endpoint def create_app() -> Starlette: @@ -53,12 +55,24 @@ def create_app() -> Starlette: Route("/valid", endpoint=valid_endpoint), Route("/is-valid", endpoint=create_is_valid_endpoint(EXTERNAL_AUTH_URL)), # ^ called by https://github.com/huggingface/model-evaluator - Route("/first-rows", endpoint=create_first_rows_endpoint(EXTERNAL_AUTH_URL)), - Route("/splits", endpoint=create_splits_endpoint(EXTERNAL_AUTH_URL)), + Route( + "/first-rows", + endpoint=create_first_rows_endpoint( + external_auth_url=EXTERNAL_AUTH_URL, hf_endpoint=HF_ENDPOINT, hf_token=HF_TOKEN + ), + ), + Route( + "/splits", + endpoint=create_splits_endpoint( + external_auth_url=EXTERNAL_AUTH_URL, hf_endpoint=HF_ENDPOINT, hf_token=HF_TOKEN + ), + ), ] to_protect: List[BaseRoute] = [ # called by the Hub webhooks - Route("/webhook", endpoint=webhook_endpoint, methods=["POST"]), + Route( + "/webhook", endpoint=create_webhook_endpoint(hf_endpoint=HF_ENDPOINT, hf_token=HF_TOKEN), methods=["POST"] + ), ] protected: List[BaseRoute] = [ Route("/healthcheck", endpoint=healthcheck_endpoint), diff --git a/services/api/src/api/config.py b/services/api/src/api/config.py index 8e5efbc77a..f1bb71d5dd 100644 --- a/services/api/src/api/config.py +++ b/services/api/src/api/config.py @@ -12,6 +12,7 @@ DEFAULT_ASSETS_DIRECTORY, DEFAULT_HF_AUTH_PATH, DEFAULT_HF_ENDPOINT, + DEFAULT_HF_TOKEN, DEFAULT_LOG_LEVEL, DEFAULT_MAX_AGE_LONG_SECONDS, DEFAULT_MAX_AGE_SHORT_SECONDS, @@ -26,6 +27,7 @@ ASSETS_DIRECTORY = get_str_or_none_value(d=os.environ, key="ASSETS_DIRECTORY", default=DEFAULT_ASSETS_DIRECTORY) HF_AUTH_PATH = get_str_or_none_value(d=os.environ, key="HF_AUTH_PATH", default=DEFAULT_HF_AUTH_PATH) HF_ENDPOINT = get_str_value(d=os.environ, key="HF_ENDPOINT", default=DEFAULT_HF_ENDPOINT) +HF_TOKEN = get_str_or_none_value(d=os.environ, key="HF_TOKEN", default=DEFAULT_HF_TOKEN) LOG_LEVEL = get_str_value(d=os.environ, key="LOG_LEVEL", default=DEFAULT_LOG_LEVEL) MAX_AGE_LONG_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_LONG_SECONDS", default=DEFAULT_MAX_AGE_LONG_SECONDS) MAX_AGE_SHORT_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_SHORT_SECONDS", default=DEFAULT_MAX_AGE_SHORT_SECONDS) diff --git a/services/api/src/api/constants.py b/services/api/src/api/constants.py index 3f749c2714..c1f298de3d 100644 --- a/services/api/src/api/constants.py +++ b/services/api/src/api/constants.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. 
+from typing import Optional + DEFAULT_APP_HOSTNAME: str = "localhost" DEFAULT_APP_NUM_WORKERS: int = 2 DEFAULT_APP_PORT: int = 8000 @@ -8,6 +10,7 @@ DEFAULT_DATASETS_ENABLE_PRIVATE: bool = False DEFAULT_HF_AUTH_PATH: str = "/api/datasets/%s/auth-check" DEFAULT_HF_ENDPOINT: str = "https://huggingface.co" +DEFAULT_HF_TOKEN: Optional[str] = None DEFAULT_LOG_LEVEL: str = "INFO" DEFAULT_MAX_AGE_LONG_SECONDS: int = 120 # 2 minutes DEFAULT_MAX_AGE_SHORT_SECONDS: int = 10 # 10 seconds diff --git a/services/api/src/api/hub.py b/services/api/src/api/hub.py new file mode 100644 index 0000000000..ae671aa3de --- /dev/null +++ b/services/api/src/api/hub.py @@ -0,0 +1,30 @@ +from typing import Optional + +from huggingface_hub.hf_api import HfApi # type: ignore +from huggingface_hub.utils import RepositoryNotFoundError # type: ignore + + +def is_dataset_supported( + dataset: str, + hf_endpoint: str, + hf_token: Optional[str] = None, +) -> bool: + """ + Check if the dataset exists on the Hub and is supported by the datasets-server. + Args: + dataset (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + hf_endpoint (`str`): + The Hub endpoint (for example: "https://huggingface.co") + hf_token (`str`, *optional*): + An authentication token (See https://huggingface.co/settings/token) + Returns: + [`bool`]: True if the dataset is supported by the datasets-server. + """ + try: + # note that token is required to access gated dataset info + info = HfApi(endpoint=hf_endpoint).dataset_info(dataset, token=hf_token) + except RepositoryNotFoundError: + return False + return info.private is False diff --git a/services/api/src/api/routes/first_rows.py b/services/api/src/api/routes/first_rows.py index b750f04249..c7a12f49c9 100644 --- a/services/api/src/api/routes/first_rows.py +++ b/services/api/src/api/routes/first_rows.py @@ -27,7 +27,11 @@ logger = logging.getLogger(__name__) -def create_first_rows_endpoint(external_auth_url: Optional[str] = None) -> Endpoint: +def create_first_rows_endpoint( + hf_endpoint: str, + hf_token: Optional[str] = None, + external_auth_url: Optional[str] = None, +) -> Endpoint: async def first_rows_endpoint(request: Request) -> Response: try: dataset_name = request.query_params.get("dataset") diff --git a/services/api/src/api/routes/splits.py b/services/api/src/api/routes/splits.py index aac9e91856..1191700cfd 100644 --- a/services/api/src/api/routes/splits.py +++ b/services/api/src/api/routes/splits.py @@ -27,7 +27,9 @@ logger = logging.getLogger(__name__) -def create_splits_endpoint(external_auth_url: Optional[str] = None) -> Endpoint: +def create_splits_endpoint( + hf_endpoint: str, hf_token: Optional[str] = None, external_auth_url: Optional[str] = None +) -> Endpoint: async def splits_endpoint(request: Request) -> Response: try: dataset_name = request.query_params.get("dataset") @@ -49,6 +51,9 @@ async def splits_endpoint(request: Request) -> Response: "The list of splits is not ready yet. Please retry later." 
) from e else: + # let's double-check if the response should exist + # see https://github.com/huggingface/datasets-server/issues/380 + # no webhook is sent when a private dataset goes public raise SplitsResponseNotFoundError("Not found.") from e except ApiCustomError as e: return get_json_api_error_response(e) diff --git a/services/api/src/api/routes/webhook.py b/services/api/src/api/routes/webhook.py index 49f1c027fb..5e91a52065 100644 --- a/services/api/src/api/routes/webhook.py +++ b/services/api/src/api/routes/webhook.py @@ -14,7 +14,8 @@ from starlette.requests import Request from starlette.responses import Response -from api.utils import are_valid_parameters, get_response +from api.hub import is_dataset_supported +from api.utils import Endpoint, get_response, is_non_empty_string logger = logging.getLogger(__name__) @@ -50,47 +51,50 @@ def get_dataset_name(id: Optional[str]) -> Optional[str]: # if id == dataset_name: # logger.info(f"ignored because a full dataset id must starts with 'datasets/': {id}") # return None - return dataset_name if are_valid_parameters([dataset_name]) else None - - -def try_to_update(id: Optional[str]) -> None: - dataset_name = get_dataset_name(id) - if dataset_name is not None: - logger.debug(f"webhook: refresh {dataset_name}") - # new implementation for the /splits endpoint - mark_splits_responses_as_stale(dataset_name) - mark_first_rows_responses_as_stale(dataset_name) - add_splits_job(dataset_name) - - -def try_to_delete(id: Optional[str]) -> None: - dataset_name = get_dataset_name(id) - if dataset_name is not None: - logger.debug(f"webhook: delete {dataset_name}") - # new implementation for the /splits endpoint - delete_splits_responses(dataset_name) - delete_first_rows_responses(dataset_name) - - -def process_payload(payload: MoonWebhookV1Payload) -> None: - try_to_update(payload["add"]) - try_to_update(payload["update"]) - try_to_delete(payload["remove"]) - - -async def webhook_endpoint(request: Request) -> Response: - try: - json = await request.json() - except Exception: - content = {"status": "error", "error": "the body could not be parsed as a JSON"} - return get_response(content, 400) - logger.info(f"/webhook: {json}") - try: - payload = parse_payload(json) - except Exception: - content = {"status": "error", "error": "the JSON payload is invalid"} - return get_response(content, 400) - - process_payload(payload) - content = {"status": "ok"} - return get_response(content, 200) + return dataset_name if is_non_empty_string(dataset_name) else None + + +def update(dataset_name: str) -> None: + logger.debug(f"webhook: refresh {dataset_name}") + mark_splits_responses_as_stale(dataset_name) + mark_first_rows_responses_as_stale(dataset_name) + add_splits_job(dataset_name) + + +def delete(dataset_name: str) -> None: + logger.debug(f"webhook: delete {dataset_name}") + delete_splits_responses(dataset_name) + delete_first_rows_responses(dataset_name) + + +def process_payload(payload: MoonWebhookV1Payload, hf_endpoint: str, hf_token: Optional[str] = None) -> None: + unique_dataset_names = {get_dataset_name(id) for id in {payload["add"], payload["remove"], payload["update"]}} + for dataset_name in unique_dataset_names: + if dataset_name is not None: + if is_dataset_supported(dataset_name, hf_endpoint, hf_token): + update(dataset_name) + else: + delete(dataset_name) + + +def create_webhook_endpoint( + hf_endpoint: str, hf_token: Optional[str] = None, external_auth_url: Optional[str] = None +) -> Endpoint: + async def webhook_endpoint(request: Request) -> Response: 
+ try: + json = await request.json() + except Exception: + content = {"status": "error", "error": "the body could not be parsed as a JSON"} + return get_response(content, 400) + logger.info(f"/webhook: {json}") + try: + payload = parse_payload(json) + except Exception: + content = {"status": "error", "error": "the JSON payload is invalid"} + return get_response(content, 400) + + process_payload(payload, hf_endpoint, hf_token) + content = {"status": "ok"} + return get_response(content, 200) + + return webhook_endpoint From 2de641db26d20bd3505558eaffddf47b3d044504 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 23 Sep 2022 09:25:20 +0000 Subject: [PATCH 3/8] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20create=20a=20new=20j?= =?UTF-8?q?ob=20if=20a=20splits/=20response=20is=20a=20cache=20miss?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a request comes to splits/, if no cache entry exist for the dataset, and if no job is pending either, we now check on the Hub if the dataset exists and is supported (ie. not private). If so, it means that we somehow missed a webhook. We add a job to the queue, and we respond with a "NotReady" error. Otherwise, we respond with "NotFound". This change is a quick fix for https://github.com/huggingface/datasets-server/issues/380#issuecomment-1254740105 Also: to be able to test, we changed the test library "responses" to a proper HTTP server ("pytest-httpserver" library), which gives us more flexibility to mock the Hub APIs (both the auth endpoint and the API datasets endpoint) and test even if we use huggingface_hub lib in the code. Also: the tests are a bit more precise (more cases) --- services/api/poetry.lock | 54 ++++-- services/api/pyproject.toml | 7 +- services/api/src/api/dataset.py | 103 ++++++++++++ services/api/src/api/hub.py | 30 ---- services/api/src/api/routes/splits.py | 11 +- services/api/src/api/routes/webhook.py | 32 +--- services/api/tests/conftest.py | 26 ++- services/api/tests/test_app.py | 191 ++++++++++++++-------- services/api/tests/test_authentication.py | 95 +++++------ services/api/tests/test_dataset.py | 23 +++ services/api/tests/utils.py | 23 ++- 11 files changed, 378 insertions(+), 217 deletions(-) create mode 100644 services/api/src/api/dataset.py delete mode 100644 services/api/src/api/hub.py create mode 100644 services/api/tests/test_dataset.py diff --git a/services/api/poetry.lock b/services/api/poetry.lock index 0b7e1c75d1..a5fa534cd4 100644 --- a/services/api/poetry.lock +++ b/services/api/poetry.lock @@ -335,6 +335,14 @@ starlette = ">=0.16.0,<0.17.0" type = "file" url = "../../libs/libutils/dist/libutils-0.2.0-py3-none-any.whl" +[[package]] +name = "markupsafe" +version = "2.1.1" +description = "Safely add untrusted strings to HTML/XML markup." 
+category = "dev" +optional = false +python-versions = ">=3.7" + [[package]] name = "mccabe" version = "0.6.1" @@ -560,6 +568,17 @@ toml = "*" [package.extras] testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] +[[package]] +name = "pytest-httpserver" +version = "1.0.6" +description = "pytest-httpserver is a httpserver for pytest" +category = "dev" +optional = false +python-versions = ">=3.7,<4.0" + +[package.dependencies] +Werkzeug = ">=2.0.0" + [[package]] name = "pyyaml" version = "6.0" @@ -586,21 +605,6 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] -[[package]] -name = "responses" -version = "0.21.0" -description = "A utility library for mocking out the `requests` Python library." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -requests = ">=2.0,<3.0" -urllib3 = ">=1.25.10" - -[package.extras] -tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] - [[package]] name = "ruamel.yaml" version = "0.17.21" @@ -792,10 +796,24 @@ PyYAML = {version = ">=3.10", optional = true, markers = "extra == \"watchmedo\" [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "werkzeug" +version = "2.2.2" +description = "The comprehensive WSGI web application library." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +MarkupSafe = ">=2.1.1" + +[package.extras] +watchdog = ["watchdog"] + [metadata] lock-version = "1.1" python-versions = "3.9.6" -content-hash = "4d293310af057fa5b93e81a80698d9a4a3a5a5358909cff78ceb352ba7207959" +content-hash = "cf90d33b884908a7275c17f252f3d6797baeca82a4b628a0447d27cd875d89c6" [metadata.files] anyio = [ @@ -953,6 +971,7 @@ libqueue = [ libutils = [ {file = "libutils-0.2.0-py3-none-any.whl", hash = "sha256:a562dd39d4b3c5ab20bb11354e8eaf582d873f0367996df9a4c3c00609f608da"}, ] +markupsafe = [] mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, @@ -1189,6 +1208,7 @@ pytest-cov = [ {file = "pytest-cov-2.12.1.tar.gz", hash = "sha256:261ceeb8c227b726249b376b8526b600f38667ee314f910353fa318caa01f4d7"}, {file = "pytest_cov-2.12.1-py2.py3-none-any.whl", hash = "sha256:261bb9e47e65bd099c89c3edf92972865210c36813f80ede5277dceb77a4a62a"}, ] +pytest-httpserver = [] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, @@ -1228,7 +1248,6 @@ requests = [ {file = "requests-2.28.0-py3-none-any.whl", hash = "sha256:bc7861137fbce630f17b03d3ad02ad0bf978c844f3536d0edda6499dafce2b6f"}, {file = "requests-2.28.0.tar.gz", hash = "sha256:d568723a7ebd25875d8d1eaf5dfa068cd2fc8194b2e483d7b1f7c81918dbec6b"}, ] -responses = [] "ruamel.yaml" = [] "ruamel.yaml.clib" = [] safety = [] @@ -1336,3 +1355,4 @@ watchdog = [ {file = "watchdog-2.1.9-py3-none-win_ia64.whl", hash = "sha256:ad576a565260d8f99d97f2e64b0f97a48228317095908568a9d5c786c829d428"}, {file = "watchdog-2.1.9.tar.gz", hash = "sha256:43ce20ebb36a51f21fa376f76d1d4692452b2527ccd601950d69ed36b9e21609"}, ] +werkzeug = [] diff --git 
a/services/api/pyproject.toml b/services/api/pyproject.toml index 3f7723bddb..3ba57dd7f5 100644 --- a/services/api/pyproject.toml +++ b/services/api/pyproject.toml @@ -25,7 +25,7 @@ mypy = "0.812" poetryup = "^0.3.8" pytest = "^6.2.5" pytest-cov = "^2.12.1" -responses = "^0.21.0" +pytest-httpserver = "^1.0.6" safety = "^2.1.1" [build-system] @@ -33,7 +33,12 @@ build-backend = "poetry.core.masonry.api" requires = ["poetry-core>=1.0.0"] [tool.pytest.ini_options] +addopts = "-k 'not deprecated'" filterwarnings = ["ignore::DeprecationWarning"] +markers = [ + "deprecated: tests on deprecated code (deselect with '-m \"not deprecated\"')", + "wip: tests being developed" +] [tool.coverage.run] source = ["api"] diff --git a/services/api/src/api/dataset.py b/services/api/src/api/dataset.py new file mode 100644 index 0000000000..1bc3e37f74 --- /dev/null +++ b/services/api/src/api/dataset.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. + +import logging + +# from http import HTTPStatus +from typing import Optional + +from huggingface_hub.hf_api import HfApi # type: ignore +from huggingface_hub.utils import RepositoryNotFoundError # type: ignore +from libcache.simple_cache import ( # DoesNotExist,; get_splits_response, + delete_first_rows_responses, + delete_splits_responses, + mark_first_rows_responses_as_stale, + mark_splits_responses_as_stale, +) +from libqueue.queue import ( # is_first_rows_response_in_process, + add_splits_job, + is_splits_response_in_process, +) + +logger = logging.getLogger(__name__) + + +def is_supported( + dataset: str, + hf_endpoint: str, + hf_token: Optional[str] = None, +) -> bool: + """ + Check if the dataset exists on the Hub and is supported by the datasets-server. + Args: + dataset (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + hf_endpoint (`str`): + The Hub endpoint (for example: "https://huggingface.co") + hf_token (`str`, *optional*): + An authentication token (See https://huggingface.co/settings/token) + Returns: + [`bool`]: True if the dataset is supported by the datasets-server. 
+ """ + try: + # note that token is required to access gated dataset info + info = HfApi(endpoint=hf_endpoint).dataset_info(dataset, token=hf_token) + except RepositoryNotFoundError: + return False + return info.private is False + + +def update(dataset: str) -> None: + logger.debug(f"webhook: refresh {dataset}") + mark_splits_responses_as_stale(dataset) + mark_first_rows_responses_as_stale(dataset) + add_splits_job(dataset) + + +def delete(dataset: str) -> None: + logger.debug(f"webhook: delete {dataset}") + delete_splits_responses(dataset) + delete_first_rows_responses(dataset) + + +def is_splits_in_process( + dataset: str, + hf_endpoint: str, + hf_token: Optional[str] = None, +) -> bool: + if is_splits_response_in_process(dataset_name=dataset): + return True + if is_supported(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token): + update(dataset=dataset) + return True + return False + + +# def is_first_rows_in_process( +# dataset: str, +# config: str, +# split: str, +# hf_endpoint: str, +# hf_token: Optional[str] = None, +# ) -> bool: +# if is_first_rows_response_in_process(dataset_name=dataset, config_name=config, split_name=split): +# return True +# # a bit convoluted, but we need to check if the first-rows response should be exist +# if is_splits_response_in_process(dataset_name=dataset): +# return True +# try: +# response, http_status, _ = get_splits_response(dataset) +# if http_status == HTTPStatus.OK and any( +# split_item["dataset"] == dataset or split_item["config"] == config or split_item["split"] == split +# for split_item in response["splits"] +# ): +# # it's listed in the splits response. +# # we assume that it does not exist in the cache (it's why one call this function) +# # relaunch the whole /splits job, because something did not work +# update(dataset=dataset) +# return True +# except DoesNotExist: +# # the splits responses does not exist, let's check if it should +# return is_splits_in_process(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token) +# return False diff --git a/services/api/src/api/hub.py b/services/api/src/api/hub.py deleted file mode 100644 index ae671aa3de..0000000000 --- a/services/api/src/api/hub.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Optional - -from huggingface_hub.hf_api import HfApi # type: ignore -from huggingface_hub.utils import RepositoryNotFoundError # type: ignore - - -def is_dataset_supported( - dataset: str, - hf_endpoint: str, - hf_token: Optional[str] = None, -) -> bool: - """ - Check if the dataset exists on the Hub and is supported by the datasets-server. - Args: - dataset (`str`): - A namespace (user or an organization) and a repo name separated - by a `/`. - hf_endpoint (`str`): - The Hub endpoint (for example: "https://huggingface.co") - hf_token (`str`, *optional*): - An authentication token (See https://huggingface.co/settings/token) - Returns: - [`bool`]: True if the dataset is supported by the datasets-server. 
- """ - try: - # note that token is required to access gated dataset info - info = HfApi(endpoint=hf_endpoint).dataset_info(dataset, token=hf_token) - except RepositoryNotFoundError: - return False - return info.private is False diff --git a/services/api/src/api/routes/splits.py b/services/api/src/api/routes/splits.py index 1191700cfd..778def04b2 100644 --- a/services/api/src/api/routes/splits.py +++ b/services/api/src/api/routes/splits.py @@ -6,11 +6,11 @@ from typing import Optional from libcache.simple_cache import DoesNotExist, get_splits_response -from libqueue.queue import is_splits_response_in_process from starlette.requests import Request from starlette.responses import Response from api.authentication import auth_check +from api.dataset import is_splits_in_process from api.utils import ( ApiCustomError, Endpoint, @@ -46,15 +46,12 @@ async def splits_endpoint(request: Request) -> Response: else: return get_json_error_response(response, http_status, error_code) except DoesNotExist as e: - if is_splits_response_in_process(dataset_name): + # maybe the splits response is in process + if is_splits_in_process(dataset=dataset_name, hf_endpoint=hf_endpoint, hf_token=hf_token): raise SplitsResponseNotReadyError( "The list of splits is not ready yet. Please retry later." ) from e - else: - # let's double-check if the response should exist - # see https://github.com/huggingface/datasets-server/issues/380 - # no webhook is sent when a private dataset goes public - raise SplitsResponseNotFoundError("Not found.") from e + raise SplitsResponseNotFoundError("Not found.") from e except ApiCustomError as e: return get_json_api_error_response(e) except Exception as err: diff --git a/services/api/src/api/routes/webhook.py b/services/api/src/api/routes/webhook.py index 5e91a52065..c27606ac06 100644 --- a/services/api/src/api/routes/webhook.py +++ b/services/api/src/api/routes/webhook.py @@ -4,17 +4,10 @@ import logging from typing import Any, Optional, TypedDict -from libcache.simple_cache import ( - delete_first_rows_responses, - delete_splits_responses, - mark_first_rows_responses_as_stale, - mark_splits_responses_as_stale, -) -from libqueue.queue import add_splits_job from starlette.requests import Request from starlette.responses import Response -from api.hub import is_dataset_supported +from api.dataset import delete, is_supported, update from api.utils import Endpoint, get_response, is_non_empty_string logger = logging.getLogger(__name__) @@ -54,32 +47,17 @@ def get_dataset_name(id: Optional[str]) -> Optional[str]: return dataset_name if is_non_empty_string(dataset_name) else None -def update(dataset_name: str) -> None: - logger.debug(f"webhook: refresh {dataset_name}") - mark_splits_responses_as_stale(dataset_name) - mark_first_rows_responses_as_stale(dataset_name) - add_splits_job(dataset_name) - - -def delete(dataset_name: str) -> None: - logger.debug(f"webhook: delete {dataset_name}") - delete_splits_responses(dataset_name) - delete_first_rows_responses(dataset_name) - - def process_payload(payload: MoonWebhookV1Payload, hf_endpoint: str, hf_token: Optional[str] = None) -> None: unique_dataset_names = {get_dataset_name(id) for id in {payload["add"], payload["remove"], payload["update"]}} for dataset_name in unique_dataset_names: if dataset_name is not None: - if is_dataset_supported(dataset_name, hf_endpoint, hf_token): - update(dataset_name) + if is_supported(dataset=dataset_name, hf_endpoint=hf_endpoint, hf_token=hf_token): + update(dataset=dataset_name) else: - delete(dataset_name) + 
delete(dataset=dataset_name) -def create_webhook_endpoint( - hf_endpoint: str, hf_token: Optional[str] = None, external_auth_url: Optional[str] = None -) -> Endpoint: +def create_webhook_endpoint(hf_endpoint: str, hf_token: Optional[str] = None) -> Endpoint: async def webhook_endpoint(request: Request) -> Response: try: json = await request.json() diff --git a/services/api/tests/conftest.py b/services/api/tests/conftest.py index a8392cd668..50f0e8473e 100644 --- a/services/api/tests/conftest.py +++ b/services/api/tests/conftest.py @@ -3,5 +3,27 @@ import os -os.environ["HF_AUTH_PATH"] = "/%s" -os.environ["HF_ENDPOINT"] = "https://fake.url" +import pytest + +port = 8888 +host = "localhost" +HF_ENDPOINT = f"http://{host}:{port}" +HF_AUTH_PATH = "/api/datasets/%s/auth-check" + +os.environ["HF_ENDPOINT"] = HF_ENDPOINT +os.environ["HF_AUTH_PATH"] = HF_AUTH_PATH + + +@pytest.fixture(scope="session") +def httpserver_listen_address(): + return (host, 8888) + + +@pytest.fixture(scope="session") +def hf_endpoint(): + return HF_ENDPOINT + + +@pytest.fixture(scope="session") +def hf_auth_path(): + return HF_AUTH_PATH diff --git a/services/api/tests/test_app.py b/services/api/tests/test_app.py index 565fa36c65..a4e0c5e285 100644 --- a/services/api/tests/test_app.py +++ b/services/api/tests/test_app.py @@ -1,28 +1,23 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. +import json from http import HTTPStatus from typing import Dict, Optional import pytest -import responses -# from libcache.cache import clean_database as clean_cache_database from libcache.simple_cache import _clean_database as clean_cache_database -from libcache.simple_cache import ( - mark_first_rows_responses_as_stale, - mark_splits_responses_as_stale, - upsert_first_rows_response, - upsert_splits_response, -) -from libqueue.queue import add_first_rows_job, add_splits_job +from libcache.simple_cache import upsert_splits_response from libqueue.queue import clean_database as clean_queue_database +from pytest_httpserver import HTTPServer from starlette.testclient import TestClient from api.app import create_app from api.config import EXTERNAL_AUTH_URL, MONGO_QUEUE_DATABASE -from .utils import request_callback +from .utils import auth_callback + external_auth_url = EXTERNAL_AUTH_URL or "%s" # for mypy @@ -81,21 +76,32 @@ def test_get_valid_datasets(client: TestClient) -> None: assert "valid" in json -@responses.activate -def test_get_is_valid(client: TestClient) -> None: - response = client.get("/is-valid") - assert response.status_code == 422 - - dataset = "doesnotexist" - responses.add_callback(responses.GET, external_auth_url % dataset, callback=request_callback) +@pytest.mark.parametrize( + "dataset,exists_on_the_hub,expected_status_code,expected_is_valid", + [ + (None, True, 422, None), + ("notinthecache", True, 200, False), + ("notinthecache", False, 404, None), + ], +) +def test_get_is_valid( + client: TestClient, + httpserver: HTTPServer, + hf_auth_path: str, + dataset: Optional[str], + exists_on_the_hub: bool, + expected_status_code: int, + expected_is_valid: Optional[bool], +) -> None: + httpserver.expect_request(hf_auth_path % dataset).respond_with_data(status=200 if exists_on_the_hub else 404) response = client.get("/is-valid", params={"dataset": dataset}) - assert response.status_code == 200 - json = response.json() - assert "valid" in json - assert json["valid"] is False + assert response.status_code == expected_status_code + if expected_is_valid is not None: + assert response.json()["valid"] == 
expected_is_valid -# the logic below is just to check the cookie and authorization headers are managed correctly +# caveat: the returned status codes don't simulate the reality +# they're just used to check every case @pytest.mark.parametrize( "headers,status_code,error_code", [ @@ -104,12 +110,16 @@ def test_get_is_valid(client: TestClient) -> None: ({}, 200, None), ], ) -@responses.activate def test_is_valid_auth( - client: TestClient, headers: Dict[str, str], status_code: int, error_code: Optional[str] + client: TestClient, + httpserver: HTTPServer, + hf_auth_path: str, + headers: Dict[str, str], + status_code: int, + error_code: Optional[str], ) -> None: dataset = "dataset-which-does-not-exist" - responses.add_callback(responses.GET, external_auth_url % dataset, callback=request_callback) + httpserver.expect_request(hf_auth_path % dataset, headers=headers).respond_with_handler(auth_callback) response = client.get(f"/is-valid?dataset={dataset}", headers=headers) assert response.status_code == status_code assert response.headers.get("X-Error-Code") == error_code @@ -130,7 +140,8 @@ def test_get_splits(client: TestClient) -> None: assert response.status_code == 422 -# the logic below is just to check the cookie and authorization headers are managed correctly +# caveat: the returned status codes don't simulate the reality +# they're just used to check every case @pytest.mark.parametrize( "headers,status_code,error_code", [ @@ -139,66 +150,106 @@ def test_get_splits(client: TestClient) -> None: ({}, 404, "SplitsResponseNotFound"), ], ) -@responses.activate -def test_splits_auth(client: TestClient, headers: Dict[str, str], status_code: int, error_code: str) -> None: +def test_splits_auth( + client: TestClient, + httpserver: HTTPServer, + hf_auth_path: str, + headers: Dict[str, str], + status_code: int, + error_code: str, +) -> None: dataset = "dataset-which-does-not-exist" - responses.add_callback(responses.GET, external_auth_url % dataset, callback=request_callback) + httpserver.expect_request(hf_auth_path % dataset, headers=headers).respond_with_handler(auth_callback) + httpserver.expect_request(f"/api/datasets/{dataset}").respond_with_data( + json.dumps({}), headers={"X-Error-Code": "RepoNotFound"} + ) response = client.get(f"/splits?dataset={dataset}", headers=headers) - assert response.status_code == status_code + assert response.status_code == status_code, f"{response.headers}, {response.json()}" assert response.headers.get("X-Error-Code") == error_code -def test_get_first_rows(client: TestClient) -> None: - # missing parameter - response = client.get("/first-rows") - assert response.status_code == 422 - response = client.get("/first-rows?dataset=a") - assert response.status_code == 422 - response = client.get("/first-rows?dataset=a&config=b") - assert response.status_code == 422 - # empty parameter - response = client.get("/first-rows?dataset=a&config=b&split=") +@pytest.mark.parametrize( + "dataset,config,split", + [ + (None, None, None), + ("a", None, None), + ("a", "b", None), + ("a", "b", ""), + ], +) +def test_get_first_rows_missing_parameter( + client: TestClient, dataset: Optional[str], config: Optional[str], split: Optional[str] +) -> None: + response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) assert response.status_code == 422 -@responses.activate -def test_splits_cache_refreshing(client: TestClient) -> None: - dataset = "acronym_identification" - responses.add_callback(responses.GET, external_auth_url % dataset, 
callback=request_callback) +@pytest.mark.parametrize( + "exists,is_private,expected_error_code", + [ + (False, None, "ExternalAuthenticatedError"), + (True, True, "SplitsResponseNotFound"), + (True, False, "SplitsResponseNotReady"), + ], +) +def test_splits_cache_refreshing( + client: TestClient, + httpserver: HTTPServer, + hf_auth_path: str, + exists: bool, + is_private: Optional[bool], + expected_error_code: str, +) -> None: + dataset = "dataset-to-be-processed" + httpserver.expect_request(hf_auth_path % dataset).respond_with_data(status=200 if exists else 404) + httpserver.expect_request(f"/api/datasets/{dataset}").respond_with_data( + json.dumps({"private": is_private}), headers={} if exists else {"X-Error-Code": "RepoNotFound"} + ) response = client.get("/splits", params={"dataset": dataset}) - assert response.json()["error"] == "Not found." - add_splits_job(dataset) - mark_splits_responses_as_stale(dataset) - # ^ has no effect for the moment (no entry for the dataset, and anyway: no way to know the value of the stale flag) - response = client.get("/splits", params={"dataset": dataset}) - assert response.json()["error"] == "The list of splits is not ready yet. Please retry later." - # simulate the worker - upsert_splits_response(dataset, {"key": "value"}, HTTPStatus.OK) - response = client.get("/splits", params={"dataset": dataset}) - assert response.json()["key"] == "value" - assert response.status_code == 200 + assert response.headers["X-Error-Code"] == expected_error_code + + if expected_error_code == "SplitsResponseNotReady": + # simulate the worker + upsert_splits_response(dataset, {"key": "value"}, HTTPStatus.OK) + response = client.get("/splits", params={"dataset": dataset}) + assert response.json()["key"] == "value" + assert response.status_code == 200 -@responses.activate -def test_first_rows_cache_refreshing(client: TestClient) -> None: - dataset = "acronym_identification" +@pytest.mark.parametrize( + "exists,is_private,expected_error_code", + [ + (False, None, "ExternalAuthenticatedError"), + (True, True, "FirstRowsResponseNotFound"), + (True, False, "FirstRowsResponseNotFound"), + ], +) +def test_first_rows_cache_refreshing( + client: TestClient, + httpserver: HTTPServer, + hf_auth_path: str, + exists: bool, + is_private: Optional[bool], + expected_error_code: str, +) -> None: + dataset = "dataset-to-be-processed" config = "default" split = "train" - responses.add_callback(responses.GET, external_auth_url % dataset, callback=request_callback) + httpserver.expect_request(hf_auth_path % dataset).respond_with_data(status=200 if exists else 404) + httpserver.expect_request(f"/api/datasets/{dataset}").respond_with_data( + json.dumps({"private": is_private}), headers={} if exists else {"X-Error-Code": "RepoNotFound"} + ) response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) - assert response.json()["error"] == "Not found." - add_first_rows_job(dataset, config, split) - mark_first_rows_responses_as_stale(dataset, config, split) - # ^ has no effect for the moment (no entry for the split, and anyway: no way to know the value of the stale flag) - response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) - assert response.json()["error"] == "The list of the first rows is not ready yet. Please retry later." 
- # simulate the worker - upsert_first_rows_response(dataset, config, split, {"key": "value"}, HTTPStatus.OK) - response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) - assert response.json()["key"] == "value" - assert response.status_code == 200 + assert response.headers["X-Error-Code"] == expected_error_code + + # if expected_error_code == "FirstRowsResponseNotReady": + # # simulate the worker + # upsert_first_rows_response(dataset, config, split, {"key": "value"}, HTTPStatus.OK) + # response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) + # assert response.json()["key"] == "value" + # assert response.status_code == 200 def test_metrics(client: TestClient) -> None: diff --git a/services/api/tests/test_authentication.py b/services/api/tests/test_authentication.py index 48bc15e2ec..da07e0c07b 100644 --- a/services/api/tests/test_authentication.py +++ b/services/api/tests/test_authentication.py @@ -1,16 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. -from typing import Dict +from contextlib import nullcontext as does_not_raise +from typing import Any, Dict import pytest -import responses +from pytest_httpserver import HTTPServer from starlette.requests import Headers, Request from api.authentication import auth_check from api.utils import ExternalAuthenticatedError, ExternalUnauthenticatedError -from .utils import request_callback +from .utils import auth_callback def test_no_auth_check() -> None: @@ -22,34 +23,33 @@ def test_invalid_auth_check_url() -> None: auth_check("dataset", external_auth_url="https://auth.check/") -@responses.activate def test_unreachable_external_auth_check_service() -> None: with pytest.raises(RuntimeError): auth_check("dataset", external_auth_url="https://auth.check/%s") -@responses.activate -def test_external_auth_responses_without_request() -> None: +@pytest.mark.parametrize( + "status_code,expectation", + [ + (200, does_not_raise()), + (401, pytest.raises(ExternalUnauthenticatedError)), + (403, pytest.raises(ExternalAuthenticatedError)), + (404, pytest.raises(ExternalAuthenticatedError)), + (429, pytest.raises(ValueError)), + ], +) +def test_external_auth_responses_without_request( + httpserver: HTTPServer, + hf_endpoint: str, + hf_auth_path: str, + status_code: int, + expectation: Any, +) -> None: dataset = "dataset" - url = "https://auth.check/%s" - responses.add(responses.GET, url % dataset, status=200) - assert auth_check(dataset, external_auth_url=url) is True - - responses.add(responses.GET, url % dataset, status=401) - with pytest.raises(ExternalUnauthenticatedError): - auth_check(dataset, external_auth_url=url) - - responses.add(responses.GET, url % dataset, status=403) - with pytest.raises(ExternalAuthenticatedError): - auth_check(dataset, external_auth_url=url) - - responses.add(responses.GET, url % dataset, status=404) - with pytest.raises(ExternalAuthenticatedError): - auth_check(dataset, external_auth_url=url) - - responses.add(responses.GET, url % dataset, status=429) - with pytest.raises(ValueError): - auth_check(dataset, external_auth_url=url) + external_auth_url = hf_endpoint + hf_auth_path + httpserver.expect_request(hf_auth_path % dataset).respond_with_data(status=status_code) + with expectation: + auth_check(dataset, external_auth_url=external_auth_url) def create_request(headers: Dict[str, str]) -> Request: @@ -67,32 +67,27 @@ def create_request(headers: Dict[str, str]) -> Request: ) -@responses.activate -def 
test_valid_responses_with_request() -> None: +@pytest.mark.parametrize( + "headers,expectation", + [ + ({"Cookie": "some cookie"}, pytest.raises(ExternalUnauthenticatedError)), + ({"Authorization": "Bearer invalid"}, pytest.raises(ExternalAuthenticatedError)), + ({}, does_not_raise()), + ], +) +def test_valid_responses_with_request( + httpserver: HTTPServer, + hf_endpoint: str, + hf_auth_path: str, + headers: Dict[str, str], + expectation: Any, +) -> None: dataset = "dataset" - url = "https://auth.check/%s" - - responses.add_callback(responses.GET, url % dataset, callback=request_callback) - - with pytest.raises(ExternalUnauthenticatedError): - auth_check( - dataset, - external_auth_url=url, - request=create_request(headers={"cookie": "some cookie"}), - ) - - with pytest.raises(ExternalAuthenticatedError): - auth_check( - dataset, - external_auth_url=url, - request=create_request(headers={"authorization": "Bearer token"}), - ) - - assert ( + external_auth_url = hf_endpoint + hf_auth_path + httpserver.expect_request(hf_auth_path % dataset).respond_with_handler(auth_callback) + with expectation: auth_check( dataset, - external_auth_url=url, - request=create_request(headers={}), + external_auth_url=external_auth_url, + request=create_request(headers=headers), ) - is True - ) diff --git a/services/api/tests/test_dataset.py b/services/api/tests/test_dataset.py new file mode 100644 index 0000000000..f5f7c88e54 --- /dev/null +++ b/services/api/tests/test_dataset.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2022 The HuggingFace Authors. + +import json + +import pytest +from pytest_httpserver import HTTPServer + +from api.dataset import is_supported + + +@pytest.mark.parametrize( + "private,exists,expected", + [(True, False, False), (False, False, True), (True, False, False)], +) +def test_is_supported(httpserver: HTTPServer, hf_endpoint: str, private: bool, exists: bool, expected: bool) -> None: + dataset = "dataset" + endpoint = f"/api/datasets/{dataset}" + hf_token = "dummy_token" + + headers = None if exists else {"X-Error-Code": "RepoNotFound"} + httpserver.expect_request(endpoint).respond_with_data(json.dumps({"private": private}), headers=headers) + assert is_supported(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token) is expected diff --git a/services/api/tests/utils.py b/services/api/tests/utils.py index c23582fa32..b7cc894b00 100644 --- a/services/api/tests/utils.py +++ b/services/api/tests/utils.py @@ -1,19 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright 2022 The HuggingFace Authors. 
-from typing import Mapping, Tuple, Union +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response -from requests import PreparedRequest -from responses import _Body - -def request_callback(request: PreparedRequest) -> Union[Exception, Tuple[int, Mapping[str, str], _Body]]: +def auth_callback(request: Request) -> Response: # return 401 if a cookie has been provided, 404 if a token has been provided, - # and 401 if none has been provided - # there is no logic behind this behavior, it's just to test if the cookie and - # token are correctly passed to the auth_check service - if request.headers.get("cookie"): - return (401, {"Content-Type": "text/plain"}, "OK") - if request.headers.get("authorization"): - return (404, {"Content-Type": "text/plain"}, "OK") - return (200, {"Content-Type": "text/plain"}, "OK") + # and 200 if none has been provided + # + # caveat: the returned status codes don't simulate the reality + # they're just used to check every case + return Response( + status=401 if request.headers.get("cookie") else 404 if request.headers.get("authorization") else 200 + ) From 956f38fdc7b036a600b6e85f8dd7612d8e9916f6 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 23 Sep 2022 09:31:09 +0000 Subject: [PATCH 4/8] =?UTF-8?q?refactor:=20=F0=9F=92=A1=20dataset=5Fname?= =?UTF-8?q?=20->=20dataset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/api/src/api/routes/splits.py | 12 ++++++------ services/api/src/api/routes/webhook.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/services/api/src/api/routes/splits.py b/services/api/src/api/routes/splits.py index 778def04b2..7e86229f16 100644 --- a/services/api/src/api/routes/splits.py +++ b/services/api/src/api/routes/splits.py @@ -32,22 +32,22 @@ def create_splits_endpoint( ) -> Endpoint: async def splits_endpoint(request: Request) -> Response: try: - dataset_name = request.query_params.get("dataset") - logger.info(f"/splits, dataset={dataset_name}") + dataset = request.query_params.get("dataset") + logger.info(f"/splits, dataset={dataset}") - if not are_valid_parameters([dataset_name]): + if not are_valid_parameters([dataset]): raise MissingRequiredParameterError("Parameter 'dataset' is required") # if auth_check fails, it will raise an exception that will be caught below - auth_check(dataset_name, external_auth_url=external_auth_url, request=request) + auth_check(dataset, external_auth_url=external_auth_url, request=request) try: - response, http_status, error_code = get_splits_response(dataset_name) + response, http_status, error_code = get_splits_response(dataset) if http_status == HTTPStatus.OK: return get_json_ok_response(response) else: return get_json_error_response(response, http_status, error_code) except DoesNotExist as e: # maybe the splits response is in process - if is_splits_in_process(dataset=dataset_name, hf_endpoint=hf_endpoint, hf_token=hf_token): + if is_splits_in_process(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token): raise SplitsResponseNotReadyError( "The list of splits is not ready yet. Please retry later." 
) from e diff --git a/services/api/src/api/routes/webhook.py b/services/api/src/api/routes/webhook.py index c27606ac06..c5ddad1e6b 100644 --- a/services/api/src/api/routes/webhook.py +++ b/services/api/src/api/routes/webhook.py @@ -48,13 +48,13 @@ def get_dataset_name(id: Optional[str]) -> Optional[str]: def process_payload(payload: MoonWebhookV1Payload, hf_endpoint: str, hf_token: Optional[str] = None) -> None: - unique_dataset_names = {get_dataset_name(id) for id in {payload["add"], payload["remove"], payload["update"]}} - for dataset_name in unique_dataset_names: - if dataset_name is not None: - if is_supported(dataset=dataset_name, hf_endpoint=hf_endpoint, hf_token=hf_token): - update(dataset=dataset_name) + unique_datasets = {get_dataset_name(id) for id in {payload["add"], payload["remove"], payload["update"]}} + for dataset in unique_datasets: + if dataset is not None: + if is_supported(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token): + update(dataset=dataset) else: - delete(dataset=dataset_name) + delete(dataset=dataset) def create_webhook_endpoint(hf_endpoint: str, hf_token: Optional[str] = None) -> Endpoint: From 21c4f32faa391885696d18c6e9555d48b96700d4 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 23 Sep 2022 09:37:08 +0000 Subject: [PATCH 5/8] =?UTF-8?q?refactor:=20=F0=9F=92=A1=20add=20intermedia?= =?UTF-8?q?te=20function=20in=20dataset.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit also: dataset_name -> dataset. also: fix style --- services/api/src/api/dataset.py | 36 +++++------------------ services/api/src/api/routes/first_rows.py | 22 +++++++------- services/api/tests/test_app.py | 2 -- 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/services/api/src/api/dataset.py b/services/api/src/api/dataset.py index 1bc3e37f74..752ef2e83d 100644 --- a/services/api/src/api/dataset.py +++ b/services/api/src/api/dataset.py @@ -14,8 +14,9 @@ mark_first_rows_responses_as_stale, mark_splits_responses_as_stale, ) -from libqueue.queue import ( # is_first_rows_response_in_process, +from libqueue.queue import ( add_splits_job, + is_first_rows_response_in_process, is_splits_response_in_process, ) @@ -74,30 +75,9 @@ def is_splits_in_process( return False -# def is_first_rows_in_process( -# dataset: str, -# config: str, -# split: str, -# hf_endpoint: str, -# hf_token: Optional[str] = None, -# ) -> bool: -# if is_first_rows_response_in_process(dataset_name=dataset, config_name=config, split_name=split): -# return True -# # a bit convoluted, but we need to check if the first-rows response should be exist -# if is_splits_response_in_process(dataset_name=dataset): -# return True -# try: -# response, http_status, _ = get_splits_response(dataset) -# if http_status == HTTPStatus.OK and any( -# split_item["dataset"] == dataset or split_item["config"] == config or split_item["split"] == split -# for split_item in response["splits"] -# ): -# # it's listed in the splits response. 
-# # we assume that it does not exist in the cache (it's why one call this function) -# # relaunch the whole /splits job, because something did not work -# update(dataset=dataset) -# return True -# except DoesNotExist: -# # the splits responses does not exist, let's check if it should -# return is_splits_in_process(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token) -# return False +def is_first_rows_in_process( + dataset: str, + config: str, + split: str, +) -> bool: + return is_first_rows_response_in_process(dataset_name=dataset, config_name=config, split_name=split) diff --git a/services/api/src/api/routes/first_rows.py b/services/api/src/api/routes/first_rows.py index c7a12f49c9..f51c155a0b 100644 --- a/services/api/src/api/routes/first_rows.py +++ b/services/api/src/api/routes/first_rows.py @@ -6,11 +6,11 @@ from typing import Optional from libcache.simple_cache import DoesNotExist, get_first_rows_response -from libqueue.queue import is_first_rows_response_in_process from starlette.requests import Request from starlette.responses import Response from api.authentication import auth_check +from api.dataset import is_first_rows_in_process from api.utils import ( ApiCustomError, Endpoint, @@ -34,28 +34,28 @@ def create_first_rows_endpoint( ) -> Endpoint: async def first_rows_endpoint(request: Request) -> Response: try: - dataset_name = request.query_params.get("dataset") - config_name = request.query_params.get("config") - split_name = request.query_params.get("split") - logger.info(f"/rows, dataset={dataset_name}, config={config_name}, split={split_name}") + dataset = request.query_params.get("dataset") + config = request.query_params.get("config") + split = request.query_params.get("split") + logger.info(f"/rows, dataset={dataset}, config={config}, split={split}") - if not are_valid_parameters([dataset_name, config_name, split_name]): + if not are_valid_parameters([dataset, config, split]): raise MissingRequiredParameterError("Parameters 'dataset', 'config' and 'split' are required") # if auth_check fails, it will raise an exception that will be caught below - auth_check(dataset_name, external_auth_url=external_auth_url, request=request) + auth_check(dataset, external_auth_url=external_auth_url, request=request) try: - response, http_status, error_code = get_first_rows_response(dataset_name, config_name, split_name) + response, http_status, error_code = get_first_rows_response(dataset, config, split) if http_status == HTTPStatus.OK: return get_json_ok_response(response) else: return get_json_error_response(response, http_status, error_code) except DoesNotExist as e: - if is_first_rows_response_in_process(dataset_name, config_name, split_name): + # maybe the first-rows response is in process + if is_first_rows_in_process(dataset=dataset, config=config, split=split): raise FirstRowsResponseNotReadyError( "The list of the first rows is not ready yet. Please retry later." 
                ) from e
-            else:
-                raise FirstRowsResponseNotFoundError("Not found.") from e
+            raise FirstRowsResponseNotFoundError("Not found.") from e
         except ApiCustomError as e:
             return get_json_api_error_response(e)
         except Exception as e:
diff --git a/services/api/tests/test_app.py b/services/api/tests/test_app.py
index a4e0c5e285..3baa9d1e28 100644
--- a/services/api/tests/test_app.py
+++ b/services/api/tests/test_app.py
@@ -6,7 +6,6 @@
 from typing import Dict, Optional
 
 import pytest
-
 from libcache.simple_cache import _clean_database as clean_cache_database
 from libcache.simple_cache import upsert_splits_response
 from libqueue.queue import clean_database as clean_queue_database
@@ -18,7 +17,6 @@
 
 from .utils import auth_callback
 
-
 external_auth_url = EXTERNAL_AUTH_URL or "%s"  # for mypy
 

From 8d9e37dc4e9196b4892455951b7a13f7abc26e4f Mon Sep 17 00:00:00 2001
From: Test User
Date: Fri, 23 Sep 2022 09:49:46 +0000
Subject: [PATCH 6/8] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20add=20(/splits)=20jo?=
 =?UTF-8?q?bs=20if=20a=20/first-rows=20resp=20is=20cache=20miss?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a request is made to /first-rows and there is no entry in the cache,
we check whether the entry should exist. Various cases have to be
managed, which makes the logic a bit (too) convoluted.

We return a FirstRowsResponseNotReady error if:

- the /first-rows response for the split is already in process
- the /splits response for the dataset is still in process

We return a FirstRowsResponseNotReady error, AND we add a /splits job
because something has failed before, if:

- the /splits response for the dataset exists, and the split is part of
  the splits of the dataset
- the /splits response for the dataset does not exist and is not in
  process, but it should be, because the dataset is supported.

Note that I didn't add a unit test for each of these cases.
---
 services/api/src/api/dataset.py           | 36 ++++++++++++++++++-----
 services/api/src/api/routes/first_rows.py |  4 ++-
 services/api/tests/test_app.py            | 30 ++++++++++++-------
 3 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/services/api/src/api/dataset.py b/services/api/src/api/dataset.py
index 752ef2e83d..da48acd638 100644
--- a/services/api/src/api/dataset.py
+++ b/services/api/src/api/dataset.py
@@ -2,15 +2,16 @@
 # Copyright 2022 The HuggingFace Authors.
import logging - -# from http import HTTPStatus +from http import HTTPStatus from typing import Optional from huggingface_hub.hf_api import HfApi # type: ignore from huggingface_hub.utils import RepositoryNotFoundError # type: ignore -from libcache.simple_cache import ( # DoesNotExist,; get_splits_response, +from libcache.simple_cache import ( + DoesNotExist, delete_first_rows_responses, delete_splits_responses, + get_splits_response, mark_first_rows_responses_as_stale, mark_splits_responses_as_stale, ) @@ -76,8 +77,29 @@ def is_splits_in_process( def is_first_rows_in_process( - dataset: str, - config: str, - split: str, + dataset: str, config: str, split: str, hf_endpoint: str, hf_token: Optional[str] = None ) -> bool: - return is_first_rows_response_in_process(dataset_name=dataset, config_name=config, split_name=split) + if is_first_rows_response_in_process(dataset_name=dataset, config_name=config, split_name=split): + return True + + # a bit convoluted, but checking if the first-rows response should exist + # requires to first parse the /splits response for the same dataset + if is_splits_response_in_process(dataset_name=dataset): + return True + try: + response, http_status, _ = get_splits_response(dataset) + if http_status == HTTPStatus.OK and any( + split_item["dataset"] == dataset or split_item["config"] == config or split_item["split"] == split + for split_item in response["splits"] + ): + # The splits is listed in the /splits response. + # Let's refresh *the whole dataset*, because something did not work + # + # Caveat: we don't check if the /first-rows response already exists in the cache, + # because we assume it's the reason why one would call this function + update(dataset=dataset) + return True + except DoesNotExist: + # the splits responses does not exist, let's check if it should + return is_splits_in_process(dataset=dataset, hf_endpoint=hf_endpoint, hf_token=hf_token) + return False diff --git a/services/api/src/api/routes/first_rows.py b/services/api/src/api/routes/first_rows.py index f51c155a0b..bfbe7c295b 100644 --- a/services/api/src/api/routes/first_rows.py +++ b/services/api/src/api/routes/first_rows.py @@ -51,7 +51,9 @@ async def first_rows_endpoint(request: Request) -> Response: return get_json_error_response(response, http_status, error_code) except DoesNotExist as e: # maybe the first-rows response is in process - if is_first_rows_in_process(dataset=dataset, config=config, split=split): + if is_first_rows_in_process( + dataset=dataset, config=config, split=split, hf_endpoint=hf_endpoint, hf_token=hf_token + ): raise FirstRowsResponseNotReadyError( "The list of the first rows is not ready yet. Please retry later." 
) from e diff --git a/services/api/tests/test_app.py b/services/api/tests/test_app.py index 3baa9d1e28..a29a3d0091 100644 --- a/services/api/tests/test_app.py +++ b/services/api/tests/test_app.py @@ -7,13 +7,13 @@ import pytest from libcache.simple_cache import _clean_database as clean_cache_database -from libcache.simple_cache import upsert_splits_response +from libcache.simple_cache import upsert_first_rows_response, upsert_splits_response from libqueue.queue import clean_database as clean_queue_database from pytest_httpserver import HTTPServer from starlette.testclient import TestClient from api.app import create_app -from api.config import EXTERNAL_AUTH_URL, MONGO_QUEUE_DATABASE +from api.config import EXTERNAL_AUTH_URL, MONGO_CACHE_DATABASE, MONGO_QUEUE_DATABASE from .utils import auth_callback @@ -22,8 +22,8 @@ @pytest.fixture(autouse=True, scope="module") def safe_guard() -> None: - # if "test" not in MONGO_CACHE_DATABASE: - # raise ValueError("Tests on cache must be launched on a test mongo database") + if "test" not in MONGO_CACHE_DATABASE: + raise ValueError("Tests on cache must be launched on a test mongo database") if "test" not in MONGO_QUEUE_DATABASE: raise ValueError("Tests on queue must be launched on a test mongo database") @@ -208,6 +208,10 @@ def test_splits_cache_refreshing( assert response.headers["X-Error-Code"] == expected_error_code if expected_error_code == "SplitsResponseNotReady": + # a subsequent request should return the same error code + response = client.get("/splits", params={"dataset": dataset}) + assert response.headers["X-Error-Code"] == expected_error_code + # simulate the worker upsert_splits_response(dataset, {"key": "value"}, HTTPStatus.OK) response = client.get("/splits", params={"dataset": dataset}) @@ -220,7 +224,7 @@ def test_splits_cache_refreshing( [ (False, None, "ExternalAuthenticatedError"), (True, True, "FirstRowsResponseNotFound"), - (True, False, "FirstRowsResponseNotFound"), + (True, False, "FirstRowsResponseNotReady"), ], ) def test_first_rows_cache_refreshing( @@ -242,12 +246,16 @@ def test_first_rows_cache_refreshing( response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) assert response.headers["X-Error-Code"] == expected_error_code - # if expected_error_code == "FirstRowsResponseNotReady": - # # simulate the worker - # upsert_first_rows_response(dataset, config, split, {"key": "value"}, HTTPStatus.OK) - # response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) - # assert response.json()["key"] == "value" - # assert response.status_code == 200 + if expected_error_code == "FirstRowsResponseNotReady": + # a subsequent request should return the same error code + response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) + assert response.headers["X-Error-Code"] == expected_error_code + + # simulate the worker + upsert_first_rows_response(dataset, config, split, {"key": "value"}, HTTPStatus.OK) + response = client.get("/first-rows", params={"dataset": dataset, "config": config, "split": split}) + assert response.json()["key"] == "value" + assert response.status_code == 200 def test_metrics(client: TestClient) -> None: From 8f7fd4ccb0aa41c1a98a36f4a7ee9eed6d50d32c Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 23 Sep 2022 10:00:33 +0000 Subject: [PATCH 7/8] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20update=20the=20docke?= =?UTF-8?q?r=20image?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- chart/docker-images.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/docker-images.yaml b/chart/docker-images.yaml index 64e989fad4..6647d83359 100644 --- a/chart/docker-images.yaml +++ b/chart/docker-images.yaml @@ -1,7 +1,7 @@ { "dockerImage": { "admin": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-admin:sha-49a60c5", - "api": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-api:sha-fe75069", + "api": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-api:sha-8d9e37d", "reverseProxy": "docker.io/nginx:1.20", "worker": { "splits": "707930574880.dkr.ecr.us-east-1.amazonaws.com/hub-datasets-server-worker:sha-0dff3bf", From 5f43d72ae7f6dae130ecdde1274adc15d50e9545 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 23 Sep 2022 11:40:40 +0000 Subject: [PATCH 8/8] =?UTF-8?q?fix:=20=F0=9F=90=9B=20pass=20HF=5FTOKEN=20t?= =?UTF-8?q?o=20the=20API=20service?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chart/templates/api/_container.tpl | 8 ++++++++ chart/templates/worker/first-rows/_container.tpl | 2 +- chart/templates/worker/splits/_container.tpl | 2 +- tools/docker-compose-datasets-server-from-local-code.yml | 1 + .../docker-compose-datasets-server-from-remote-images.yml | 1 + 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/chart/templates/api/_container.tpl b/chart/templates/api/_container.tpl index 87228447b9..b4a88f4f6b 100644 --- a/chart/templates/api/_container.tpl +++ b/chart/templates/api/_container.tpl @@ -14,6 +14,14 @@ value: {{ .Values.api.assetsDirectory | quote }} - name: HF_ENDPOINT value: {{ .Values.hfEndpoint | quote }} + - name: HF_TOKEN + # see https://kubernetes.io/docs/concepts/configuration/secret/#creating-a-secret + # and https://kubernetes.io/docs/concepts/configuration/secret/#using-secrets-as-environment-variables + valueFrom: + secretKeyRef: + name: {{ .Values.secrets.hfToken | quote }} + key: HF_TOKEN + optional: false - name: LOG_LEVEL value: {{ .Values.api.logLevel | quote }} - name: MAX_AGE_LONG_SECONDS diff --git a/chart/templates/worker/first-rows/_container.tpl b/chart/templates/worker/first-rows/_container.tpl index 79af2dfe19..e12820ac93 100644 --- a/chart/templates/worker/first-rows/_container.tpl +++ b/chart/templates/worker/first-rows/_container.tpl @@ -13,7 +13,7 @@ - name: HF_DATASETS_CACHE value: "{{ .Values.worker.firstRows.cacheDirectory }}/datasets" - name: HF_ENDPOINT - value: "{{ .Values.hfEndpoint }}" + value: {{ .Values.hfEndpoint | quote }} # note: HF_MODULES_CACHE is not set to a shared directory - name: HF_MODULES_CACHE value: "/tmp/modules-cache" diff --git a/chart/templates/worker/splits/_container.tpl b/chart/templates/worker/splits/_container.tpl index 82c77f50f8..6f7ca5d34d 100644 --- a/chart/templates/worker/splits/_container.tpl +++ b/chart/templates/worker/splits/_container.tpl @@ -13,7 +13,7 @@ - name: HF_DATASETS_CACHE value: "{{ .Values.worker.splits.cacheDirectory }}/datasets" - name: HF_ENDPOINT - value: "{{ .Values.hfEndpoint }}" + value: {{ .Values.hfEndpoint | quote }} - name: HF_MODULES_CACHE value: "/tmp/modules-cache" # the size should remain so small that we don't need to worry about putting it on an external storage diff --git a/tools/docker-compose-datasets-server-from-local-code.yml b/tools/docker-compose-datasets-server-from-local-code.yml index 9c83740a45..42efb6e9bb 100644 --- a/tools/docker-compose-datasets-server-from-local-code.yml +++ 
b/tools/docker-compose-datasets-server-from-local-code.yml @@ -34,6 +34,7 @@ services: APP_PORT: 8080 ASSETS_DIRECTORY: "/assets" HF_ENDPOINT: ${HF_ENDPOINT} + HF_TOKEN: ${HF_TOKEN} MONGO_URL: "mongodb://mongodb" ports: # for debug diff --git a/tools/docker-compose-datasets-server-from-remote-images.yml b/tools/docker-compose-datasets-server-from-remote-images.yml index cb1b92d777..deedb14656 100644 --- a/tools/docker-compose-datasets-server-from-remote-images.yml +++ b/tools/docker-compose-datasets-server-from-remote-images.yml @@ -31,6 +31,7 @@ services: APP_PORT: 8080 ASSETS_DIRECTORY: "/assets" HF_ENDPOINT: ${HF_ENDPOINT} + HF_TOKEN: ${HF_TOKEN} # use shorter cache durations for the e2e tests MAX_AGE_SHORT_SECONDS: 1 MAX_AGE_LONG_SECONDS: 2
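Editor's note: the webhook route and the /splits and /first-rows fallbacks in the patches above rely on an `is_supported(dataset=..., hf_endpoint=..., hf_token=...)` check, and the HF_TOKEN wired through the chart and the docker-compose files in the last patch is what that check would use for gated datasets. Below is a minimal sketch of such a check, assuming it is built on `HfApi.dataset_info` from `huggingface_hub` and that a dataset counts as supported when it exists on the Hub and is not private; the actual implementation in the api service may differ.

from typing import Optional

from huggingface_hub.hf_api import HfApi  # type: ignore
from huggingface_hub.utils import RepositoryNotFoundError  # type: ignore


def is_supported(dataset: str, hf_endpoint: str, hf_token: Optional[str] = None) -> bool:
    """Sketch: a dataset is supported if it exists on the Hub and is public (gated or not)."""
    try:
        # the token (HF_TOKEN) is passed so that the Hub call also works for gated datasets
        info = HfApi(endpoint=hf_endpoint).dataset_info(repo_id=dataset, token=hf_token)
    except RepositoryNotFoundError:
        # the dataset does not exist, or it is not visible with this token
        return False
    return not info.private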