Move workers/datasets_based to services/worker (#800)
* feat: 🎸 use primitive parameters, add release, add tests

* style: 💄 fix style

* feat: 🎸 use primitive parameters, add release, add tests

* style: 💄 fix style

* log a warning when the migration cannot access the database

thanks @AndreaFrancis

* Update libs/libcommon/tests/test_resources.py

Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>

* feat: 🎸 use primitive parameters, add release, add tests

* style: 💄 fix style

* feat: 🎸 move workers/datasets_based to services/worker

* fix: 🐛 fix the Helm chart

* feat: 🎸 upgrade the minor versions of the packages

and update the kenlm source

* style: 💄 fix style

* test: 💍 fix the tests if the runner is slow

* fix: 🐛 refactor to avoid having worker.py in the root

Having worker.py at the root is not allowed, since it's also the name of
the package.
Now:
- WorkerLoop becomes Loop
- Worker becomes JobRunner

The new terms are more accurate: a JobRunner only processes one job (see
the sketch after the commit message).

* Update services/worker/pyproject.toml

Co-authored-by: Andrea Francis Soria Jimenez <andrea@huggingface.co>

* Update services/worker/src/worker/config.py

Co-authored-by: Andrea Francis Soria Jimenez <andrea@huggingface.co>

* Revert "Update services/worker/src/worker/config.py"

This reverts commit 1bd9324.

---------

Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Co-authored-by: Andrea Francis Soria Jimenez <andrea@huggingface.co>
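The rename is easier to see in code. Below is a minimal sketch of the new split; only the class names `Loop` and `JobRunner` come from the commit — every method, attribute and helper here is invented for illustration, not the actual implementation:

```python
# Hypothetical sketch of the rename: a JobRunner (formerly Worker) handles
# exactly one job, while the Loop (formerly WorkerLoop) keeps pulling jobs
# from the queue and hands each one to a fresh JobRunner. All names other
# than Loop and JobRunner are invented for illustration.
from dataclasses import dataclass
from typing import List


@dataclass
class Job:
    dataset: str


class JobRunner:
    """Processes a single job, then is done."""

    def __init__(self, job: Job) -> None:
        self.job = job

    def run(self) -> None:
        print(f"processing one job for dataset {self.job.dataset!r}")


class Loop:
    """Pops jobs off the queue and runs each one with a fresh JobRunner."""

    def __init__(self, queue: List[Job]) -> None:
        self.queue = queue

    def run(self) -> None:
        while self.queue:
            JobRunner(self.queue.pop(0)).run()


if __name__ == "__main__":
    Loop([Job("dataset-a"), Job("dataset-b")]).run()
```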
3 people authored Feb 13, 2023
1 parent 67c2eee commit f43a0d2
Showing 102 changed files with 1,509 additions and 638 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build_push_docker_hub.yml
@@ -20,8 +20,8 @@ jobs:
         project: admin
       - directory: services
         project: api
-      - directory: workers
-        project: datasets_based
+      - directory: services
+        project: worker
   runs-on: "ubuntu-latest"
   steps:
     - name: Checkout repository
2 changes: 0 additions & 2 deletions .github/workflows/e2e.yml
@@ -11,7 +11,6 @@ on:
       - 'e2e/**'
       - 'libs/**'
       - 'services/**'
-      - 'workers/**'
       - 'chart/static-files/openapi.json'
       - '.github/workflows/_e2e_tests.yml'
       - '.github/workflows/_quality-python.yml'
@@ -23,7 +22,6 @@ on:
       - 'e2e/**'
      - 'libs/**'
      - 'services/**'
-      - 'workers/**'
      - 'chart/static-files/openapi.json'
      - '.github/workflows/_e2e_tests.yml'
      - '.github/workflows/_quality-python.yml'
.github/workflows/w-datasets_based.yml → .github/workflows/s-worker.yml
@@ -1,25 +1,25 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2022 The HuggingFace Authors.

-name: workers/datasets_based
+name: services/worker
 on:
   workflow_dispatch:
   push:
     branches:
       - main
     paths:
       - 'libs/libcommon/**'
-      - 'workers/datasets_based/**'
-      - '.github/workflows/w-datasets_based.yml'
+      - 'services/worker/**'
+      - '.github/workflows/s-worker.yml'
       - '.github/workflows/_quality-python.yml'
       - '.github/workflows/_unit-tests-python.yml'
       - 'tools/docker-compose-mongo.yml'
       - 'vendors/'
   pull_request:
     paths:
       - 'libs/libcommon/**'
-      - 'workers/datasets_based/**'
-      - '.github/workflows/w-datasets_based.yml'
+      - 'services/worker/**'
+      - '.github/workflows/s-worker.yml'
       - '.github/workflows/_quality-python.yml'
       - '.github/workflows/_unit-tests-python.yml'
       - 'tools/docker-compose-mongo.yml'
@@ -28,10 +28,10 @@ jobs:
   quality:
     uses: ./.github/workflows/_quality-python.yml
     with:
-      working-directory: workers/datasets_based
+      working-directory: services/worker
       is-datasets-worker: true
   unit-tests:
     uses: ./.github/workflows/_unit-tests-python.yml
     with:
-      working-directory: workers/datasets_based
+      working-directory: services/worker
      is-datasets-worker: true
11 changes: 5 additions & 6 deletions .vscode/monorepo.code-workspace
@@ -25,21 +25,20 @@
       "path": "../services/api"
     },
     {
-      "name": "services/reverse-proxy",
-      "path": "../services/reverse-proxy"
+      "name": "services/worker",
+      "path": "../services/worker"
     },
     {
-      "name": "workers/datasets_based",
-      "path": "../workers/datasets_based"
+      "name": "services/reverse-proxy",
+      "path": "../services/reverse-proxy"
     }
   ],
   "settings": {
     "files.exclude": {
       "e2e": true,
       "jobs": true,
       "libs": true,
-      "services": true,
-      "workers": true
+      "services": true
     },
     "python.formatting.provider": "black",
     "python.linting.enabled": true,
13 changes: 6 additions & 7 deletions DEVELOPER_GUIDE.md
@@ -28,7 +28,7 @@ make dev-start
 In development mode, you don't need to rebuild the docker images to apply a change in a worker.
 You can just restart the worker's docker container and it will apply your changes.

-To install a single job (in [jobs](./jobs)), library (in [libs](./libs)), service (in [services](./services)) or worker (in [workers](./workers)), go to their respective directory, and install Python 3.9 (consider [pyenv](https://github.com/pyenv/pyenv)) and [poetry](https://python-poetry.org/docs/master/#installation) (don't forget to add `poetry` to the `PATH` environment variable).
+To install a single job (in [jobs](./jobs)), library (in [libs](./libs)) or service (in [services](./services)), go to their respective directory, and install Python 3.9 (consider [pyenv](https://github.com/pyenv/pyenv)) and [poetry](https://python-poetry.org/docs/master/#installation) (don't forget to add `poetry` to the `PATH` environment variable).

 If you use pyenv:

@@ -51,20 +51,19 @@ If you use VSCode, it might be useful to use the ["monorepo" workspace](./.vscod

 ## Architecture

-The repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs), [libs](./libs), [services](./services) and [workers](./workers):
+The repository is structured as a monorepo, with Python libraries and applications in [jobs](./jobs), [libs](./libs) and [services](./services):

 - [jobs](./jobs) contains the one-time jobs run by Helm before deploying the pods. For now, the only job migrates the databases when needed.
 - [libs](./libs) contains the Python libraries used by the services and workers. For now, the only library is [libcommon](./libs/libcommon), which contains the common code for the services and workers.
-- [services](./services) contains the applications: the public API, the admin API (which is separated from the public API and might be published under its own domain at some point) and the reverse proxy.
-- [workers](./workers) contains the workers that process the queue asynchronously: they get a "job" (caution: not the Helm jobs, but the jobs stored in the queue), process the expected response for the associated endpoint, and store the response in the cache.
+- [services](./services) contains the applications: the public API, the admin API (which is separated from the public API and might be published under its own domain at some point), the reverse proxy, and the worker that processes the queue asynchronously: it gets a "job" (caution: the jobs stored in the queue, not the Helm jobs), processes the expected response for the associated endpoint, and stores the response in the cache.

 If you have access to the internal HF Notion, see https://www.notion.so/huggingface2/Datasets-server-464848da2a984e999c540a4aa7f0ece5.

 The application is distributed in several components.

 [api](./services/api) is a web server that exposes the [API endpoints](https://huggingface.co/docs/datasets-server). Apart from some endpoints (`valid`, `is-valid`), all the responses are served from pre-computed responses. That's the main point of this project: generating these responses takes time, and the API server provides this service to the users.

-The precomputed responses are stored in a Mongo database called "cache". They are computed by [workers](./workers) which take their jobs from a job queue stored in a Mongo database called "queue", and store the results (error or valid response) into the "cache" (see [libcommon](./libs/libcommon)).
+The precomputed responses are stored in a Mongo database called "cache". They are computed by [workers](./services/worker) which take their jobs from a job queue stored in a Mongo database called "queue", and store the results (error or valid response) into the "cache" (see [libcommon](./libs/libcommon)).

 The API service exposes the `/webhook` endpoint which is called by the Hub on every creation, update or deletion of a dataset on the Hub. On deletion, the cached responses are deleted. On creation or update, a new job is appended in the "queue" database.

@@ -156,7 +155,7 @@ GITHUB_TOKEN=xxx

 ## Mac OS

-To install the [datasets based worker](./workers/datasets_based) on Mac OS, follow the steps below.
+To install the [datasets based worker](./services/worker) on Mac OS, follow the steps below.

 ### First: as an administrator

@@ -219,7 +218,7 @@ $ pyenv install 3.9.15

 Check that the expected local version of Python is used:

 ```bash
-$ cd workers/datasets_based
+$ cd services/worker
 $ python --version
 Python 3.9.15
 ```
5 changes: 2 additions & 3 deletions chart/env/dev.yaml
@@ -38,11 +38,10 @@ images:
       useGlobalRegistry: false
       repository: datasets-server-services-api
       tag: sha-27ad2f7
-  workers:
-    datasetsBased:
+    worker:
       registry: huggingface
       useGlobalRegistry: false
-      repository: datasets-server-workers-datasets_based
+      repository: datasets-server-services-worker
       tag: sha-27ad2f7

 secrets:
5 changes: 2 additions & 3 deletions chart/env/prod.yaml
@@ -28,11 +28,10 @@ images:
       useGlobalRegistry: false
       repository: datasets-server-services-api
       tag: sha-27ad2f7
-  workers:
-    datasetsBased:
+    worker:
       registry: huggingface
       useGlobalRegistry: false
-      repository: datasets-server-workers-datasets_based
+      repository: datasets-server-services-worker
       tag: sha-27ad2f7

 secrets:
2 changes: 1 addition & 1 deletion chart/templates/_helpers.tpl
@@ -83,7 +83,7 @@ imagePullSecrets:
 {{- end -}}

 {{- define "workers.datasetsBased.image" -}}
-{{ include "datasetsServer.images.image" (dict "imageRoot" .Values.images.workers.datasetsBased "global" .Values.global.huggingface) }}
+{{ include "datasetsServer.images.image" (dict "imageRoot" .Values.images.services.worker "global" .Values.global.huggingface) }}
 {{- end -}}
5 changes: 2 additions & 3 deletions chart/values.yaml
@@ -35,11 +35,10 @@ images:
       useGlobalRegistry: false
       repository: datasets-server-services-api
       tag: sha-27ad2f7
-  workers:
-    datasetsBased:
+    worker:
       registry: huggingface
       useGlobalRegistry: false
-      repository: datasets-server-workers-datasets_based
+      repository: datasets-server-services-worker
       tag: sha-27ad2f7

2 changes: 1 addition & 1 deletion docs/source/server.mdx
@@ -25,7 +25,7 @@ You might've noticed the `/valid` and `/is-valid` endpoints don't have a job in

 Workers are responsible for executing the jobs in the queue. They complete the actual preprocessing requests, such as getting a list of splits and configurations. The workers can be controlled by configurable environment variables, like the minimum or the maximum number of rows returned by a worker or the maximum number of jobs to start per dataset user or organization.

-Take a look at the [workers configuration](https://github.com/huggingface/datasets-server/tree/main/workers/datasets_based#configuration) for a complete list of the environment variables if you're interested in learning more.
+Take a look at the [workers configuration](https://github.com/huggingface/datasets-server/tree/main/services/worker#configuration) for a complete list of the environment variables if you're interested in learning more.

 ## Cache
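As a rough illustration of how a worker can consume such settings — the variable names below are invented for the example, not the service's actual configuration keys (those are listed on the linked configuration page):

```python
# Hypothetical example: reading worker limits from environment variables.
# WORKER_MAX_ROWS and WORKER_MAX_JOBS_PER_NAMESPACE are illustrative names,
# not the real configuration keys.
import os

max_rows = int(os.environ.get("WORKER_MAX_ROWS", "100"))
max_jobs_per_namespace = int(os.environ.get("WORKER_MAX_JOBS_PER_NAMESPACE", "1"))
print(f"max rows: {max_rows}, max jobs per namespace: {max_jobs_per_namespace}")
```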
2 changes: 1 addition & 1 deletion jobs/mongodb_migration/poetry.lock


4 changes: 2 additions & 2 deletions jobs/mongodb_migration/src/mongodb_migration/main.py
@@ -35,12 +35,12 @@ def run_job() -> None:
             "The connection to the cache database could not be established. The migration job is skipped."
         )
         return
-    if queue_resource.is_available() is False:
+    if not queue_resource.is_available():
         logging.warning(
             "The connection to the queue database could not be established. The migration job is skipped."
         )
         return
-    if migrations_database_resource.is_available() is False:
+    if not migrations_database_resource.is_available():
         logging.warning(
             "The connection to the migrations database could not be established. The migration job is skipped."
         )
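The hunk above swaps `is_available() is False` for the idiomatic `not is_available()`; for a method that returns a `bool` the two are equivalent. A self-contained sketch of the early-return guard pattern — `FakeResource` is a stand-in for the real database resources, not the project's classes:

```python
# Minimal sketch of the early-return guards in run_job(); FakeResource is a
# stand-in for the real cache/queue/migrations database resources.
import logging


class FakeResource:
    def __init__(self, available: bool) -> None:
        self._available = available

    def is_available(self) -> bool:
        return self._available


def run_job(queue_resource: FakeResource) -> None:
    # Idiomatic truthiness check instead of `is_available() is False`.
    if not queue_resource.is_available():
        logging.warning("The connection to the queue database could not be established. The migration job is skipped.")
        return
    logging.info("running the migrations")


run_job(FakeResource(available=False))  # logs the warning and returns early
```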
2 changes: 1 addition & 1 deletion libs/libcommon/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 authors = ["Sylvain Lesage <sylvain.lesage@huggingface.co>"]
-description = "Library for utils, common to all the services and workers"
+description = "Library for utils common to all the services"
 name = "libcommon"
 version = "0.6.8"
 license = "Apache-2.0"
3 changes: 2 additions & 1 deletion libs/libcommon/tests/test_queue.py
@@ -225,4 +225,5 @@ def test_get_total_duration_per_dataset() -> None:
     # cancel one remaining job
     queue.cancel_started_jobs()
     # check the total duration
-    assert queue.get_total_duration_per_dataset() == {test_dataset: duration * 3}
+    assert queue.get_total_duration_per_dataset()[test_dataset] >= duration * 3
+    # ^ it should be equal, not >=, but if the runner is slow, it might take a bit more time
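Relaxing `==` to `>=` is the standard fix for this kind of flaky timing test: a slow runner can only make the measured total larger, never smaller, so the lower bound is the invariant worth asserting. A self-contained sketch of the idea (not the actual `Queue` class):

```python
# Sketch: assert a lower bound on measured durations so the test stays green
# on slow CI runners, where wall-clock time can only overshoot.
import time


def timed_job(min_seconds: float) -> float:
    start = time.perf_counter()
    time.sleep(min_seconds)  # stands in for real work taking at least this long
    return time.perf_counter() - start


total = sum(timed_job(0.01) for _ in range(3))
# `total == 0.03` would be flaky; the sleeps only guarantee at least 0.03 s.
assert total >= 0.03
```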
2 changes: 1 addition & 1 deletion libs/libcommon/tests/test_resources.py
@@ -21,7 +21,7 @@ def test_database_resource(queue_mongo_host: str) -> None:
     database_2 = "datasets_server_2"
     host = queue_mongo_host
     mongoengine_alias = "datasets_server_mongo_alias"
-    server_selection_timeout_ms = 1_000
+    server_selection_timeout_ms = 5_000
     resource_1 = MongoResource(
         database=database_1,
         host=host,
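Raising the server selection timeout from 1 s to 5 s gives a slow runner more time to reach MongoDB before the driver gives up. With pymongo directly, the knob looks like the sketch below; the test itself goes through the project's `MongoResource` wrapper, which presumably forwards the value to the driver:

```python
# Sketch with pymongo: serverSelectionTimeoutMS bounds how long the driver
# waits to find a reachable server before raising ServerSelectionTimeoutError.
from pymongo import MongoClient
from pymongo.errors import ServerSelectionTimeoutError

client = MongoClient("mongodb://localhost:27017", serverSelectionTimeoutMS=5_000)
try:
    client.admin.command("ping")  # forces a server selection round-trip
    print("server reachable")
except ServerSelectionTimeoutError:
    print("no reachable server within 5 s")
```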
2 changes: 1 addition & 1 deletion services/admin/poetry.lock


2 changes: 1 addition & 1 deletion services/api/poetry.lock


File renamed without changes.
File renamed without changes.
12 changes: 6 additions & 6 deletions workers/datasets_based/Dockerfile → services/worker/Dockerfile
@@ -25,13 +25,13 @@ RUN pip install -U pip
 RUN pip install "poetry==$POETRY_VERSION"

 WORKDIR /src
-COPY workers/datasets_based/vendors ./workers/datasets_based/vendors/
-COPY workers/datasets_based/poetry.lock ./workers/datasets_based/poetry.lock
-COPY workers/datasets_based/pyproject.toml ./workers/datasets_based/pyproject.toml
+COPY services/worker/vendors ./services/worker/vendors/
+COPY services/worker/poetry.lock ./services/worker/poetry.lock
+COPY services/worker/pyproject.toml ./services/worker/pyproject.toml
 COPY libs/libcommon ./libs/libcommon
-WORKDIR /src/workers/datasets_based/
+WORKDIR /src/services/worker/
 RUN poetry install --no-cache
-COPY workers/datasets_based/src ./src
+COPY services/worker/src ./src
 RUN poetry install --no-cache

-ENTRYPOINT ["poetry", "run", "python", "src/datasets_based/main.py"]
+ENTRYPOINT ["poetry", "run", "python", "src/worker/main.py"]
4 changes: 2 additions & 2 deletions workers/datasets_based/Makefile → services/worker/Makefile
@@ -1,5 +1,5 @@
 # environment variables for the commands (docker compose, poetry)
-export COMPOSE_PROJECT_NAME := datasets_based
+export COMPOSE_PROJECT_NAME := worker
 export MONGO_PORT := 27040
 export CACHE_MONGO_URL := mongodb://localhost:${MONGO_PORT}
 export QUEUE_MONGO_URL := mongodb://localhost:${MONGO_PORT}
@@ -12,4 +12,4 @@ include ../../tools/Docker.mk

 .PHONY: run
 run:
-	poetry run python src/datasets_based/main.py
+	poetry run python src/worker/main.py
File renamed without changes.
@@ -25,17 +25,17 @@ RUN pip install -U pip
 RUN pip install "poetry==$POETRY_VERSION"

 WORKDIR /src
-COPY workers/datasets_based/vendors ./workers/datasets_based/vendors/
-COPY workers/datasets_based/poetry.lock ./workers/datasets_based/poetry.lock
-COPY workers/datasets_based/pyproject.toml ./workers/datasets_based/pyproject.toml
+COPY services/worker/vendors ./services/worker/vendors/
+COPY services/worker/poetry.lock ./services/worker/poetry.lock
+COPY services/worker/pyproject.toml ./services/worker/pyproject.toml
 COPY libs/libcommon ./libs/libcommon
-WORKDIR /src/workers/datasets_based/
+WORKDIR /src/services/worker/
 RUN poetry install --no-cache

 # FOR LOCAL DEVELOPMENT ENVIRONMENT
 # No need to copy the source code since we map a volume in docker-compose-base.yaml
 # Removed: COPY services/worker/src ./src
 # Removed: RUN poetry install --no-cache
 # However we need to install the package when the container starts
 # Added: poetry install
-ENTRYPOINT ["/bin/sh", "-c" , "poetry install && poetry run python src/datasets_based/main.py"]
+ENTRYPOINT ["/bin/sh", "-c" , "poetry install && poetry run python src/worker/main.py"]


File renamed without changes.
workers/datasets_based/pyproject.toml → services/worker/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 authors = ["Sylvain Lesage <sylvain.lesage@huggingface.co>"]
-description = "Worker for processing steps that need the datasets library"
-name = "datasets_based"
+description = "Worker that processes jobs and stores the responses in the cache"
+name = "worker"
 version = "1.0.0"
 license = "Apache-2.0"

@@ -65,7 +65,7 @@ markers = [
 ]

 [tool.coverage.run]
-source = ["datasets_based"]
+source = ["worker"]

 [tool.isort]
 profile = "black"
File renamed without changes.
File renamed without changes.