Rename obsolete mentions to datasets_based #805

Merged 1 commit on Feb 13, 2023
2 changes: 1 addition & 1 deletion DEVELOPER_GUIDE.md
@@ -155,7 +155,7 @@ GITHUB_TOKEN=xxx

## Mac OS

To install the [datasets based worker](./services/worker) on Mac OS, you can follow the next steps.
To install the [worker](./services/worker) on Mac OS, you can follow the next steps.

### First: as an administrator

4 changes: 0 additions & 4 deletions chart/env/dev.yaml
@@ -239,7 +239,3 @@ sizes:
limits:
cpu: 1
memory: "4Gi"

# --- datasets_based ---
datasetsBased:
contentMaxBytes: "10_000_000"
4 changes: 0 additions & 4 deletions chart/env/prod.yaml
@@ -309,7 +309,3 @@ sizes:
limits:
cpu: 2
memory: "1Gi"

# --- datasets_based ---
datasetsBased:
contentMaxBytes: "10_000_000"
2 changes: 0 additions & 2 deletions chart/templates/_envDatasetsBased.tpl
@@ -8,7 +8,5 @@
value: "/tmp/modules-cache"
- name: NUMBA_CACHE_DIR
value: "/tmp/numba-cache"
- name: CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes}}
{{- end -}}

19 changes: 19 additions & 0 deletions chart/templates/_envWorker.tpl
@@ -0,0 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2022 The HuggingFace Authors.

{{- define "envWorker" -}}
- name: WORKER_CONTENT_MAX_BYTES
value: {{ .Values.worker.contentMaxBytes | quote}}
# WORKER_ENDPOINT is not defined here, it's hard-coded in the template
- name: WORKER_MAX_DISK_USAGE_PCT
value: {{ .Values.worker.maxDiskUsagePct | quote }}
- name: WORKER_MAX_LOAD_PCT
value: {{ .Values.worker.maxLoadPct | quote }}
- name: WORKER_MAX_MEMORY_PCT
value: {{ .Values.worker.maxMemoryPct | quote }}
- name: WORKER_SLEEP_SECONDS
value: {{ .Values.worker.sleepSeconds | quote }}
- name: WORKER_STORAGE_PATHS
value: {{ .Values.assets.storageDirectory | quote }}
# ^ note: for datasets_based workers, the datasets cache is automatically added, so no need to add it here
{{- end -}}
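For illustration, given worker values like those documented below in `chart/values.yaml`, the new `envWorker` template would render to an env list roughly like the following (a sketch, not exact chart output; the `/assets` storage directory is a hypothetical example):

```yaml
- name: WORKER_CONTENT_MAX_BYTES
  value: "10_000_000"
- name: WORKER_MAX_DISK_USAGE_PCT
  value: "90"
- name: WORKER_MAX_LOAD_PCT
  value: "70"
- name: WORKER_MAX_MEMORY_PCT
  value: "80"
- name: WORKER_SLEEP_SECONDS
  value: "15"
- name: WORKER_STORAGE_PATHS
  value: "/assets"
```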
13 changes: 0 additions & 13 deletions chart/templates/_envWorkerLoop.tpl

This file was deleted.

4 changes: 2 additions & 2 deletions chart/templates/_helpers.tpl
@@ -82,7 +82,7 @@ imagePullSecrets:
{{ include "datasetsServer.images.image" (dict "imageRoot" .Values.images.services.api "global" .Values.global.huggingface) }}
{{- end -}}

{{- define "workers.datasetsBased.image" -}}
{{- define "services.worker.image" -}}
{{ include "datasetsServer.images.image" (dict "imageRoot" .Values.images.services.worker "global" .Values.global.huggingface) }}
{{- end -}}

@@ -263,4 +263,4 @@ Return the HUB url
{{- $hubName := ((list $.Release.Name "hub") | join "-") | trunc 63 | trimSuffix "-" -}}
http://{{ $hubName }}
{{- end -}}
{{- end -}}
{{- end -}}
8 changes: 3 additions & 5 deletions chart/templates/worker/config-names/_container.tpl
@@ -3,21 +3,19 @@

{{- define "containerWorkerConfigNames" -}}
- name: "{{ include "name" . }}-worker-config-names"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/config-names"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
{{ include "envWorker" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
- name: DATASETS_BASED_HF_DATASETS_CACHE
value: {{ printf "%s/config-names/datasets" .Values.cacheDirectory | quote }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/templates/worker/dataset-info/_container.tpl
@@ -3,18 +3,16 @@

{{- define "containerWorkerDatasetInfo" -}}
- name: "{{ include "name" . }}-worker-dataset-info"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/dataset-info"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
{{ include "envWorker" . | nindent 2 }}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
11 changes: 3 additions & 8 deletions chart/templates/worker/first-rows/_container.tpl
@@ -3,25 +3,20 @@

{{- define "containerWorkerFirstRows" -}}
- name: "{{ include "name" . }}-worker-first-rows"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/first-rows"
# ^ hard-coded
{{ include "envAssets" . | nindent 2 }}
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
- name: WORKER_LOOP_STORAGE_PATHS
value: {{ .Values.assets.storageDirectory | quote }}
# ^ note: the datasets cache is automatically added, so no need to add it here
{{ include "envWorker" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
- name: DATASETS_BASED_HF_DATASETS_CACHE
value: {{ printf "%s/first-rows/datasets" .Values.cacheDirectory | quote }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
@@ -3,21 +3,19 @@

{{- define "containerWorkerParquetAndDatasetInfo" -}}
- name: "{{ include "name" . }}-worker-parquet-and-dataset-info"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/parquet-and-dataset-info"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
{{ include "envWorker" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
- name: DATASETS_BASED_HF_DATASETS_CACHE
value: {{ printf "%s/parquet-and-dataset-info/datasets" .Values.cacheDirectory | quote }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/templates/worker/parquet/_container.tpl
@@ -3,18 +3,16 @@

{{- define "containerWorkerParquet" -}}
- name: "{{ include "name" . }}-worker-parquet"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/parquet"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
{{ include "envWorker" . | nindent 2 }}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/templates/worker/sizes/_container.tpl
@@ -3,18 +3,16 @@

{{- define "containerWorkerSizes" -}}
- name: "{{ include "name" . }}-worker-sizes"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/sizes"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
{{ include "envWorker" . | nindent 2 }}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/templates/worker/split-names/_container.tpl
@@ -3,21 +3,19 @@

{{- define "containerWorkerSplitNames" -}}
- name: "{{ include "name" . }}-worker-split-names"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/split-names"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
{{ include "envWorker" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
- name: DATASETS_BASED_HF_DATASETS_CACHE
value: {{ printf "%s/split-names/datasets" .Values.cacheDirectory | quote }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/templates/worker/splits/_container.tpl
@@ -3,21 +3,19 @@

{{- define "containerWorkerSplits" -}}
- name: "{{ include "name" . }}-worker-splits"
image: {{ include "workers.datasetsBased.image" . }}
image: {{ include "services.worker.image" . }}
imagePullPolicy: {{ .Values.images.pullPolicy }}
env:
- name: DATASETS_BASED_ENDPOINT
- name: WORKER_ENDPOINT
value: "/splits"
# ^ hard-coded
{{ include "envCache" . | nindent 2 }}
{{ include "envQueue" . | nindent 2 }}
{{ include "envCommon" . | nindent 2 }}
{{ include "envWorkerLoop" . | nindent 2 }}
{{ include "envWorker" . | nindent 2 }}
{{ include "envDatasetsBased" . | nindent 2 }}
- name: DATASETS_BASED_HF_DATASETS_CACHE
value: {{ printf "%s/splits/datasets" .Values.cacheDirectory | quote }}
- name: DATASETS_BASED_CONTENT_MAX_BYTES
value: {{ .Values.datasetsBased.contentMaxBytes | quote}}
- name: QUEUE_MAX_JOBS_PER_NAMESPACE
# value: {{ .Values.queue.maxJobsPerNamespace | quote }}
# overridden
8 changes: 3 additions & 5 deletions chart/values.yaml
@@ -94,7 +94,9 @@ queue:
# Name of the mongo db database used to store the jobs queue
mongoDatabase: "datasets_server_queue"

workerLoop:
worker:
# maximum size in bytes of the response content computed by a worker
contentMaxBytes: "10_000_000"
# maximum disk usage of every storage disk in the list (in percentage) to allow a job to start. Set to 0 to disable the test.
maxDiskUsagePct: 90
# Max CPU load (%) - if reached, sleeps until it comes back under the limit. Set to 0 to disable the test.
@@ -376,7 +378,3 @@ sizes:
limits:
cpu: 0
tolerations: []

# --- datasets_based ---
datasetsBased:
contentMaxBytes: "10_000_000"
32 changes: 16 additions & 16 deletions services/worker/README.md
@@ -6,27 +6,37 @@

Use environment variables to configure the worker. The prefix of each environment variable gives its scope.

## Worker configuration

Set environment variables to configure the worker.

- `WORKER_CONTENT_MAX_BYTES`: the maximum size in bytes of the response content computed by a worker (to prevent returning big responses in the REST API). Defaults to `10_000_000`.
- `WORKER_ENDPOINT`: the endpoint on which the worker will work (pre-compute and cache the response). The same worker is used for different endpoints to reuse shared code and dependencies, but at runtime the worker is assigned only one endpoint. Allowed values: `/splits`, `/first-rows`, `/parquet-and-dataset-info`, etc. Defaults to `/splits`.
- `WORKER_MAX_DISK_USAGE_PCT`: maximum disk usage of every storage disk in the list (in percentage) to allow a job to start. Set to 0 to disable the test. Defaults to 90.
- `WORKER_MAX_LOAD_PCT`: maximum load of the machine (in percentage: the max between the 1m load and the 5m load divided by the number of CPUs \*100) allowed to start a job. Set to 0 to disable the test. Defaults to 70.
- `WORKER_MAX_MEMORY_PCT`: maximum memory (RAM + SWAP) usage of the machine (in percentage) allowed to start a job. Set to 0 to disable the test. Defaults to 80.
- `WORKER_SLEEP_SECONDS`: wait duration in seconds at each loop iteration before checking if resources are available and processing a job if any is available. Note that the loop doesn't wait just after finishing a job: the next job is immediately processed. Defaults to `15`.
- `WORKER_STORAGE_PATHS`: comma-separated list of paths to check for disk usage. Defaults to empty.
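As a sketch, a minimal local configuration might look like this (the values mirror the documented defaults; the storage path is an illustrative assumption, not a recommendation):

```shell
# Illustrative worker configuration (defaults shown explicitly;
# the storage path is a hypothetical example).
export WORKER_ENDPOINT="/splits"
export WORKER_CONTENT_MAX_BYTES="10_000_000"
export WORKER_MAX_DISK_USAGE_PCT="90"
export WORKER_MAX_LOAD_PCT="70"
export WORKER_MAX_MEMORY_PCT="80"
export WORKER_SLEEP_SECONDS="15"
export WORKER_STORAGE_PATHS="/storage/assets"
```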

### Datasets based worker

Set environment variables to configure the datasets-based worker (`DATASETS_BASED_` prefix):

- `DATASETS_BASED_ENDPOINT`: the endpoint on which the worker will work (pre-compute and cache the response). The same worker is used for different endpoints to reuse shared code and dependencies. But at runtime, the worker is assigned only one endpoint. Allowed values: `/splits`, `/first_rows`, and ` /parquet-and-dataset-info`. Defaults to `/splits`.
- `DATASETS_BASED_HF_DATASETS_CACHE`: directory where the `datasets` library will store the cached datasets' data. If not set, the datasets library will choose the default location. Defaults to None.
- `DATASETS_BASED_CONTENT_MAX_BYTES`: the maximum size in bytes of the response content computed by a worker (to prevent returning big responses in the REST API). Defaults to `10_000_000`.

Also, set the modules cache configuration for the datasets-based worker. See [../../libs/libcommon/README.md](../../libs/libcommon/README.md). Note that this variable has no `DATASETS_BASED_` prefix:

- `HF_MODULES_CACHE`: directory where the `datasets` library will store the cached dataset scripts. If not set, the datasets library will choose the default location. Defaults to None.

Note that both directories will be appended to `WORKER_LOOP_STORAGE_PATHS` (see [../../libs/libcommon/README.md](../../libs/libcommon/README.md)) to hold the workers when the disk is full.
Note that both directories will be appended to `WORKER_STORAGE_PATHS` (see [../../libs/libcommon/README.md](../../libs/libcommon/README.md)) to hold the workers when the disk is full.
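For example, both caches can be pointed at dedicated directories (the paths below are hypothetical):

```shell
# Hypothetical cache locations for the datasets-based worker.
export DATASETS_BASED_HF_DATASETS_CACHE="/storage/datasets-cache"
export HF_MODULES_CACHE="/storage/modules-cache"
```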

### Numba library

Numba requires the `NUMBA_CACHE_DIR` environment variable to point to a writable directory where it caches the compiled functions; this is required on cloud infrastructure (see https://stackoverflow.com/a/63367171/7351594):

- `NUMBA_CACHE_DIR`: directory where the `numba` decorators (used by `librosa`) can write cache.

Note that this directory will be appended to `WORKER_LOOP_STORAGE_PATHS` (see [../../libs/libcommon/README.md](../../libs/libcommon/README.md)) to hold the workers when the disk is full.
Note that this directory will be appended to `WORKER_STORAGE_PATHS` (see [../../libs/libcommon/README.md](../../libs/libcommon/README.md)) to hold the workers when the disk is full.
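For example, the chart in this repository points Numba at a temporary directory; a similar setting can be used locally:

```shell
# Writable cache directory for numba-compiled functions (used by librosa).
export NUMBA_CACHE_DIR="/tmp/numba-cache"
```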

### Huggingface_hub library

@@ -36,7 +36,7 @@ If the Hub is not https://huggingface.co (i.e., if you set the `COMMON_HF_ENDPOI

### First rows worker

Only needed when the `DATASETS_BASED_ENDPOINT` is set to `/first-rows`.
Only needed when the `WORKER_ENDPOINT` is set to `/first-rows`.

Set environment variables to configure the first rows worker (`FIRST_ROWS_` prefix):

@@ -50,7 +60,7 @@ Also, set the assets-related configuration for the first-rows worker. See [../..

### Parquet and dataset info worker

Only needed when the `DATASETS_BASED_ENDPOINT` is set to `/parquet-and-dataset-info`.
Only needed when the `WORKER_ENDPOINT` is set to `/parquet-and-dataset-info`.

Set environment variables to configure the parquet worker (`PARQUET_AND_DATASET_INFO_` prefix):

@@ -70,13 +80,3 @@ The splits worker does not need any additional configuration.
### Common

See [../../libs/libcommon/README.md](../../libs/libcommon/README.md) for more information about the common configuration.

## Worker loop configuration

Set environment variables to configure the worker loop that processes the queue.

- `WORKER_LOOP_MAX_DISK_USAGE_PCT`: maximum disk usage of every storage disk in the list (in percentage) to allow a job to start. Set to 0 to disable the test. Defaults to 90.
- `WORKER_LOOP_MAX_LOAD_PCT`: maximum load of the machine (in percentage: the max between the 1m load and the 5m load divided by the number of CPUs \*100) allowed to start a job. Set to 0 to disable the test. Defaults to 70.
- `WORKER_LOOP_MAX_MEMORY_PCT`: maximum memory (RAM + SWAP) usage of the machine (in percentage) allowed to start a job. Set to 0 to disable the test. Defaults to 80.
- `WORKER_LOOP_SLEEP_SECONDS`: wait duration in seconds at each loop iteration before checking if resources are available and processing a job if any is available. Note that the loop doesn't wait just after finishing a job: the next job is immediately processed. Defaults to `15`.
- `WORKER_LOOP_STORAGE_PATHS`: comma-separated list of paths to check for disk usage. Defaults to empty.