Commit

Details (#589)
* chore: 🤖 remove useless file

* docs: ✏️ replace deprecated /rows with /first-rows

* feat: 🎸 update protobuf (fixes a security vulnerability)
severo authored Sep 26, 2022
1 parent 25f6a81 commit 72963ce
Showing 7 changed files with 16 additions and 102 deletions.
2 changes: 1 addition & 1 deletion chart/static-files/openapi.json
@@ -1154,7 +1154,7 @@
"description": "The list of the 100 first rows of a dataset split.",
"externalDocs": {
"description": "See First rows (Hub docs)",
-"url": "https://huggingface.co/docs/datasets-server/rows"
+"url": "https://huggingface.co/docs/datasets-server/first-rows"
},
"operationId": "listFirstRows",
"security": [
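The `listFirstRows` operation above takes `dataset`, `config`, and `split` as query parameters (see `services/api/src/api/routes/first_rows.py` further down). A minimal sketch of building such a request URL — note the base URL is an assumption for illustration and does not appear in this diff:

```python
from urllib.parse import urlencode

# Assumed base URL, for illustration only; it is not stated in this diff.
BASE_URL = "https://datasets-server.huggingface.co"

def first_rows_url(dataset: str, config: str, split: str) -> str:
    """Build the query URL for the listFirstRows operation."""
    query = urlencode({"dataset": dataset, "config": config, "split": split})
    return f"{BASE_URL}/first-rows?{query}"

print(first_rows_url("glue", "cola", "train"))
```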
16 changes: 8 additions & 8 deletions chart/values.yaml
@@ -132,15 +132,15 @@ worker:
maxMemoryPct: 0
# Max size (in bytes) of the dataset to fallback in normal mode if streaming fails
maxSizeFallback: "100_000_000"
-# Min size of a cell in the /rows endpoint response in bytes
+# Min size of a cell in the /first-rows endpoint response in bytes
minCellBytes: 100
# Directory of the "numba" library cache
numbaCacheDirectory: "/numba-cache"
-# Max size of the /rows endpoint response in bytes
+# Max size of the /first-rows endpoint response in bytes
rowMaxBytes: "1_000_000"
-# Max number of rows in the /rows endpoint response
+# Max number of rows in the /first-rows endpoint response
rowsMaxNumber: 100
-# Min number of rows in the /rows endpoint response
+# Min number of rows in the /first-rows endpoint response
rowsMinNumber: 10
# Number of seconds a worker will sleep before trying to process a new job
workerSleepSeconds: 15
@@ -176,15 +176,15 @@ worker:
maxMemoryPct: 0
# Max size (in bytes) of the dataset to fallback in normal mode if streaming fails
maxSizeFallback: "100_000_000"
-# Min size of a cell in the /rows endpoint response in bytes
+# Min size of a cell in the /first-rows endpoint response in bytes
minCellBytes: 100
# Directory of the "numba" library cache
numbaCacheDirectory: "/numba-cache"
-# Max size of the /rows endpoint response in bytes
+# Max size of the /first-rows endpoint response in bytes
rowMaxBytes: "1_000_000"
-# Max number of rows in the /rows endpoint response
+# Max number of rows in the /first-rows endpoint response
rowsMaxNumber: 100
-# Min number of rows in the /rows endpoint response
+# Min number of rows in the /first-rows endpoint response
rowsMinNumber: 10
# Number of seconds a worker will sleep before trying to process a new job
workerSleepSeconds: 15
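The `rowMaxBytes`, `rowsMaxNumber`, and `rowsMinNumber` values above bound the size of a /first-rows response. A hedged sketch of how such limits could interact — an illustration only, not the worker's actual truncation logic, which this diff does not show:

```python
import json

# Values mirror the chart defaults above; names are illustrative.
ROWS_MAX_BYTES = 1_000_000
ROWS_MAX_NUMBER = 100
ROWS_MIN_NUMBER = 10

def size_in_bytes(obj) -> int:
    # Measure the object as it would appear serialized in the JSON response.
    return len(json.dumps(obj).encode("utf-8"))

def select_rows(rows: list) -> list:
    """Keep at most ROWS_MAX_NUMBER rows, then drop rows from the end while
    the serialized payload exceeds ROWS_MAX_BYTES, but never go below
    ROWS_MIN_NUMBER rows."""
    selected = rows[:ROWS_MAX_NUMBER]
    while len(selected) > ROWS_MIN_NUMBER and size_in_bytes(selected) > ROWS_MAX_BYTES:
        selected = selected[:-1]
    return selected
```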
2 changes: 1 addition & 1 deletion services/api/src/api/routes/first_rows.py
@@ -37,7 +37,7 @@ async def first_rows_endpoint(request: Request) -> Response:
dataset = request.query_params.get("dataset")
config = request.query_params.get("config")
split = request.query_params.get("split")
-logger.info(f"/rows, dataset={dataset}, config={config}, split={split}")
+logger.info(f"/first-rows, dataset={dataset}, config={config}, split={split}")

if not are_valid_parameters([dataset, config, split]):
raise MissingRequiredParameterError("Parameters 'dataset', 'config' and 'split' are required")
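The `are_valid_parameters` check above rejects missing or empty query parameters before the endpoint proceeds. One plausible implementation, sketched as an assumption — the real helper lives elsewhere in the api service and is not part of this diff:

```python
from typing import List, Optional

def are_valid_parameters(parameters: List[Optional[str]]) -> bool:
    # A parameter is valid when it is present and non-empty;
    # query_params.get() returns None for absent parameters.
    return all(p is not None and p != "" for p in parameters)

print(are_valid_parameters(["glue", "cola", "train"]))  # all present
print(are_valid_parameters(["glue", None, "train"]))    # missing config
```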
59 changes: 0 additions & 59 deletions services/worker/.env.example

This file was deleted.

6 changes: 3 additions & 3 deletions services/worker/README.md
@@ -24,8 +24,8 @@ Set environment variables to configure the following aspects:
- `MONGO_QUEUE_DATABASE`: the name of the database used for storing the queue. Defaults to `"datasets_server_queue"`.
- `MONGO_URL`: the URL used to connect to the mongo db server. Defaults to `"mongodb://localhost:27017"`.
- `NUMBA_CACHE_DIR`: directory where the `numba` decorators (used by `librosa`) can write cache. Required on cloud infrastructure (see https://stackoverflow.com/a/63367171/7351594).
-- `ROWS_MAX_BYTES`: the max size of the /rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
-- `ROWS_MAX_NUMBER`: the max number of rows fetched by the worker for the split, and provided in the /rows endpoint response. Defaults to `100`.
-- `ROWS_MIN_NUMBER`: the min number of rows fetched by the worker for the split, and provided in the /rows endpoint response. Defaults to `10`.
+- `ROWS_MAX_BYTES`: the max size of the /first-rows endpoint response in bytes. Defaults to `1_000_000` (1 MB).
+- `ROWS_MAX_NUMBER`: the max number of rows fetched by the worker for the split, and provided in the /first-rows endpoint response. Defaults to `100`.
+- `ROWS_MIN_NUMBER`: the min number of rows fetched by the worker for the split, and provided in the /first-rows endpoint response. Defaults to `10`.
- `WORKER_QUEUE`: name of the queue the worker will pull jobs from. It can be `splits_responses` or `first_rows_responses`. The `splits_responses` jobs should be much faster than the `first_rows_responses` ones, so many more workers are needed for `first_rows_responses` than for `splits_responses`. Defaults to `splits_responses`.
- `WORKER_SLEEP_SECONDS`: duration in seconds of a worker wait loop iteration, before checking if resources are available and processing a job if any is available. Note that the worker does not sleep on the first loop after finishing a job. Defaults to `15`.
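The environment variables documented above can be read with plain stdlib code. A sketch using the README's defaults — the variable names and defaults come from the README; everything else is illustrative:

```python
import os

# Defaults mirror the README; override via the environment.
ROWS_MAX_BYTES = int(os.environ.get("ROWS_MAX_BYTES", "1_000_000"))
ROWS_MAX_NUMBER = int(os.environ.get("ROWS_MAX_NUMBER", "100"))
ROWS_MIN_NUMBER = int(os.environ.get("ROWS_MIN_NUMBER", "10"))
WORKER_QUEUE = os.environ.get("WORKER_QUEUE", "splits_responses")
WORKER_SLEEP_SECONDS = int(os.environ.get("WORKER_SLEEP_SECONDS", "15"))
```

Note that `int("1_000_000")` is valid in Python 3.6+, so the underscore-grouped defaults shown in the chart values parse directly.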
31 changes: 2 additions & 29 deletions services/worker/poetry.lock


2 changes: 1 addition & 1 deletion services/worker/src/worker/responses/first_rows.py
@@ -89,7 +89,7 @@ def get_rows(
def get_size_in_bytes(obj: Any):
return sys.getsizeof(orjson_dumps(obj))
# ^^ every row is transformed here in a string, because it corresponds to
-# the size the row will contribute in the JSON response to /rows endpoint.
+# the size the row will contribute in the JSON response to /first-rows endpoint.
# the size the row will contribute in the JSON response to /first-rows endpoint.
# The size of the string is measured in bytes.
# An alternative would have been to look at the memory consumption (pympler) but it's
# less related to what matters here (size of the JSON, number of characters in the
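The comment above explains why each row is serialized before being measured: the relevant size is the row's footprint in the /first-rows JSON response, not its in-memory footprint. A stdlib-only sketch of the same idea — the worker uses `orjson_dumps`, substituted here with `json` so the snippet is self-contained:

```python
import json
import sys

def get_size_in_bytes(obj) -> int:
    # Serialize first: what matters is how many characters the row adds
    # to the JSON response. (The worker uses orjson_dumps instead of json.)
    return sys.getsizeof(json.dumps(obj))

print(get_size_in_bytes({"id": 0, "text": "hello"}))
```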
