diff --git a/chart/templates/worker/first-rows/_container.tpl b/chart/templates/worker/first-rows/_container.tpl index 7fee8754b9..9c2cc31861 100644 --- a/chart/templates/worker/first-rows/_container.tpl +++ b/chart/templates/worker/first-rows/_container.tpl @@ -24,8 +24,6 @@ # value: {{ .Values.queue.maxJobsPerNamespace | quote }} # overridden value: {{ .Values.firstRows.queue.maxJobsPerNamespace | quote }} - - name: FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE - value: {{ .Values.firstRows.fallbackMaxDatasetSize | quote }} - name: FIRST_ROWS_MAX_BYTES value: {{ .Values.firstRows.maxBytes | quote }} - name: FIRST_ROWS_MAX_NUMBER diff --git a/chart/values.yaml b/chart/values.yaml index 4c865211cf..0b3b9cf3d5 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -282,8 +282,6 @@ splits: tolerations: [] firstRows: - # Max size (in bytes) of the dataset to fallback in normal mode if streaming fails - fallbackMaxDatasetSize: "100_000_000" # Max size of the /first-rows endpoint response in bytes maxBytes: "1_000_000" # Max number of rows in the /first-rows endpoint response diff --git a/tools/docker-compose-datasets-server.yml b/tools/docker-compose-datasets-server.yml index 77028267f5..c668ada2a6 100644 --- a/tools/docker-compose-datasets-server.yml +++ b/tools/docker-compose-datasets-server.yml @@ -138,7 +138,6 @@ services: ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/assets" # hard-coded to work with the reverse-proxy ASSETS_STORAGE_DIRECTORY: ${ASSETS_STORAGE_DIRECTORY-/assets} DATASETS_BASED_ENDPOINT: "/first-rows" # hard-coded - FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE: ${FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE-100_000_000} FIRST_ROWS_MAX_BYTES: ${FIRST_ROWS_MAX_BYTES-1_000_000} FIRST_ROWS_MAX_NUMBER: ${FIRST_ROWS_MAX_NUMBER-100} FIRST_ROWS_MIN_CELL_BYTES: ${FIRST_ROWS_MIN_CELL_BYTES-100} diff --git a/tools/docker-compose-dev-datasets-server.yml b/tools/docker-compose-dev-datasets-server.yml index ad95e0adea..fca77917f9 100644 --- a/tools/docker-compose-dev-datasets-server.yml +++ b/tools/docker-compose-dev-datasets-server.yml @@ -138,7 +138,6 @@ services: ASSETS_BASE_URL: "http://localhost:${PORT_REVERSE_PROXY-8000}/assets" # hard-coded to work with the reverse-proxy ASSETS_STORAGE_DIRECTORY: ${ASSETS_STORAGE_DIRECTORY-/assets} DATASETS_BASED_ENDPOINT: "/first-rows" # hard-coded - FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE: ${FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE-100_000_000} FIRST_ROWS_MAX_BYTES: ${FIRST_ROWS_MAX_BYTES-1_000_000} FIRST_ROWS_MAX_NUMBER: ${FIRST_ROWS_MAX_NUMBER-100} FIRST_ROWS_MIN_CELL_BYTES: ${FIRST_ROWS_MIN_CELL_BYTES-100} diff --git a/workers/datasets_based/README.md b/workers/datasets_based/README.md index 6d8f2fb957..adf390583e 100644 --- a/workers/datasets_based/README.md +++ b/workers/datasets_based/README.md @@ -39,7 +39,6 @@ Only needed when the `DATASETS_BASED_ENDPOINT` is set to `/first-rows`. Set environment variables to configure the first rows worker (`FIRST_ROWS_` prefix): -- `FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE`: the maximum size in bytes of the dataset to fall back into normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`. - `FIRST_ROWS_MAX_BYTES`: the max size of the /first-rows endpoint response in bytes. Defaults to `1_000_000` (1 MB). - `FIRST_ROWS_MAX_NUMBER`: the max number of rows fetched by the worker for the split and provided in the /first-rows endpoint response. Defaults to `100`. - `FIRST_ROWS_MIN_CELL_BYTES`: the minimum size in bytes of a cell when truncating the content of a row (see `FIRST_ROWS_ROWS_MAX_BYTES`). Below this limit, the cell content will not be truncated. Defaults to `100`. diff --git a/workers/datasets_based/src/datasets_based/config.py b/workers/datasets_based/src/datasets_based/config.py index 9373befc78..cedf201efb 100644 --- a/workers/datasets_based/src/datasets_based/config.py +++ b/workers/datasets_based/src/datasets_based/config.py @@ -71,7 +71,6 @@ def from_env() -> "DatasetsBasedConfig": ) -FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE = 100_000_000 FIRST_ROWS_MAX_BYTES = 1_000_000 FIRST_ROWS_MAX_NUMBER = 100 FIRST_ROWS_CELL_MIN_BYTES = 100 @@ -82,7 +81,6 @@ def from_env() -> "DatasetsBasedConfig": @dataclass class FirstRowsConfig: assets: AssetsConfig = field(default_factory=AssetsConfig) - fallback_max_dataset_size: int = FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE max_bytes: int = FIRST_ROWS_MAX_BYTES max_number: int = FIRST_ROWS_MAX_NUMBER min_cell_bytes: int = FIRST_ROWS_CELL_MIN_BYTES @@ -95,9 +93,6 @@ def from_env() -> "FirstRowsConfig": with env.prefixed("FIRST_ROWS_"): return FirstRowsConfig( assets=AssetsConfig.from_env(), - fallback_max_dataset_size=env.int( - name="FALLBACK_MAX_DATASET_SIZE", default=FIRST_ROWS_FALLBACK_MAX_DATASET_SIZE - ), max_bytes=env.int(name="MAX_BYTES", default=FIRST_ROWS_MAX_BYTES), max_number=env.int(name="MAX_NUMBER", default=FIRST_ROWS_MAX_NUMBER), min_cell_bytes=env.int(name="CELL_MIN_BYTES", default=FIRST_ROWS_CELL_MIN_BYTES), diff --git a/workers/datasets_based/src/datasets_based/workers/first_rows.py b/workers/datasets_based/src/datasets_based/workers/first_rows.py index d95460acec..f4aa20eb4f 100644 --- a/workers/datasets_based/src/datasets_based/workers/first_rows.py +++ b/workers/datasets_based/src/datasets_based/workers/first_rows.py @@ -399,12 +399,12 @@ def compute_first_rows_response( assets_base_url: str, hf_token: Optional[str], min_cell_bytes: int, - max_size_fallback: Optional[int], rows_max_bytes: int, rows_max_number: int, rows_min_number: int, columns_max_number: int, assets_directory: str, + max_size_fallback: Optional[int] = None, ) -> FirstRowsResponse: """ Get the response of /first-rows for one specific split of a dataset from huggingface.co. @@ -635,7 +635,6 @@ def compute(self) -> Mapping[str, Any]: assets_directory=self.first_rows_config.assets.storage_directory, hf_token=self.common_config.hf_token, min_cell_bytes=self.first_rows_config.min_cell_bytes, - max_size_fallback=self.first_rows_config.fallback_max_dataset_size, rows_max_bytes=self.first_rows_config.max_bytes, rows_max_number=self.first_rows_config.max_number, rows_min_number=self.first_rows_config.min_number,