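# values.yaml: default configuration values for the datasets-server Helm chart.
# A minimal usage sketch (the release name and chart path are assumptions):
#   helm install datasets-server ./chart --values ./custom-values.yaml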

global:
  huggingface:
    imageRegistry: ""
    imagePullSecrets: []
    privateHub:
      enabled: false
    ingress:
      domain: huggingface.co
      subdomains:
        datasetsServer: datasets-server

images:
  pullPolicy: IfNotPresent
  pullSecrets: []
  reverseProxy:
    useGlobalRegistry: false
    registry: docker.io
    repository: nginx
    tag: "1.20"
  jobs:
    mongodbMigration:
      registry: huggingface
      useGlobalRegistry: false
      repository: datasets-server-jobs-mongodb_migration
      tag: sha-25ff490
  services:
    admin:
      registry: huggingface
      useGlobalRegistry: false
      repository: datasets-server-services-admin
      tag: sha-25ff490
    api:
      registry: huggingface
      useGlobalRegistry: false
      repository: datasets-server-services-api
      tag: sha-25ff490
  workers:
    datasetsBased:
      registry: huggingface
      useGlobalRegistry: false
      repository: datasets-server-workers-datasets_based
      tag: sha-25ff490
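
# To pull the images from a private mirror, a sketch (the registry host is a
# placeholder) is to set the global registry and opt an image into it:
#   global:
#     huggingface:
#       imageRegistry: "registry.example.com"
#   images:
#     reverseProxy:
#       useGlobalRegistry: true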

common:
  # URL of the HuggingFace Hub
  hfEndpoint: ""
  # Log level
  logLevel: "INFO"

# --- common parameters ---

secrets:
  mongoUrl:
    fromSecret: false
    secretName: "mongo-url"
    value: mongo://
  appHfToken:
    fromSecret: false
    secretName: ""
    value: hf_app
  userHfToken:
    fromSecret: false
    secretName: "hf-token-francky"
    value: hf_
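
# Instead of inlining credentials, each entry can be read from a pre-existing
# Kubernetes Secret. A sketch, assuming a Secret named "hf-app-token" already
# exists in the release namespace:
#   secrets:
#     appHfToken:
#       fromSecret: true
#       secretName: "hf-app-token"
#       value: ""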

uid: 1000
gid: 3000

persistence:
  existingClaim: ""
  storageClass: ""
  size: 20Gi
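
# To reuse a PersistentVolumeClaim created outside of the chart, a sketch
# (the claim name is a placeholder):
#   persistence:
#     existingClaim: "datasets-server-data"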

monitoring:
  enabled: false

mongodb:
  enabled: true
  nameOverride: datasets-server-mongodb
  useStatefulSet: true
  auth:
    enabled: false
  serviceAccount:
    create: false
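
# To use an external MongoDB instead of the bundled subchart, a sketch (the
# connection string is a placeholder):
#   mongodb:
#     enabled: false
#   secrets:
#     mongoUrl:
#       fromSecret: false
#       value: "mongodb://mongo.example.com:27017"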

cache:
  # Name of the MongoDB database used to cache the API responses
  mongoDatabase: "datasets_server_cache"

queue:
  # Maximum number of jobs running at the same time for the same namespace
  maxJobsPerNamespace: 1
  # Name of the MongoDB database used to store the jobs queue
  mongoDatabase: "datasets_server_queue"

workerLoop:
  # Maximum disk usage (in percent) of each storage disk in the list for a job to be allowed to start. Set to 0 to disable the check.
  maxDiskUsagePct: 90
  # Maximum CPU load (in percent). If reached, the worker sleeps until the load comes back under the limit. Set to 0 to disable the check.
  maxLoadPct: 0
  # Maximum memory usage (RAM + swap, in percent). If reached, the worker sleeps until usage comes back under the limit. Set to 0 to disable the check.
  maxMemoryPct: 0
  # Number of seconds a worker sleeps before trying to process a new job
  sleepSeconds: 5
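
# A sketch of enabling the CPU and memory checks (the thresholds are
# illustrative, not recommendations):
#   workerLoop:
#     maxLoadPct: 70
#     maxMemoryPct: 80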

assets:
  # Base URL for the asset files. It should be set according to the datasets-server domain, e.g. https://datasets-server.huggingface.co/assets
  # baseUrl: "not used for now"
  # Directory on the shared storage (audio files and images)
  storageDirectory: "/assets"

# Directory where the cache data will be stored
cacheDirectory: "/datasets-server-cache"

# --- jobs (pre-install/upgrade hooks) ---

mongodbMigration:
  # Name of the MongoDB database used to store the migrations history
  mongoDatabase: "datasets_server_maintenance"
  nodeSelector: {}
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []
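
# A sketch of replacing the zero CPU placeholders above with a real allocation,
# assuming the chart copies this map into the container spec (the values are
# illustrative):
#   mongodbMigration:
#     resources:
#       requests:
#         cpu: 100m
#       limits:
#         cpu: "1"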

# --- storage admin (to manually inspect the storage, in /data) ---

storageAdmin:
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  service:
    type: ClusterIP
    annotations: {}
  tolerations: []

# --- reverse proxy ---

reverseProxy:
  nameOverride: datasets-server-reverse-proxy
  host: localhost
  port: 8080
  nginxTemplateFile: "nginx-templates/default.conf.template"
  openapiFile: "static-files/openapi.json"
  error404File: "nginx-templates/404.html"
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  service:
    type: ClusterIP
    annotations: {}
  tolerations: []
  ingress:
    tls: []
    annotations: {}
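
# A sketch of exposing the proxy with TLS, assuming the chart renders `tls` as
# a standard Ingress TLS block (the host and secret names are placeholders):
#   reverseProxy:
#     ingress:
#       annotations:
#         kubernetes.io/ingress.class: "nginx"
#       tls:
#         - hosts:
#             - "datasets-server.example.com"
#           secretName: "datasets-server-tls"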

# --- services ---

admin:
  # HF organization whose members are allowed to request the reports
  hfOrganization: "huggingface"
  # Number of reports in the /cache-reports/... endpoints
  cacheReportsNumResults: 100
  # Number of reports in the /cache-reports-with-content/... endpoints
  cacheReportsWithContentNumResults: 100
  # Path of the whoami service on the Hub
  hfWhoamiPath: "/api/whoami-v2"
  # Number of seconds to set in the `max-age` header on technical endpoints
  maxAge: "10"
  # Directory where the uvicorn workers share their prometheus metrics
  # see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
  prometheusMultiprocDirectory: "/tmp"
  # Hostname - it must not be set to localhost to work in Kubernetes!
  uvicornHostname: "0.0.0.0"
  # Number of uvicorn workers for running the application
  uvicornNumWorkers: "1"
  # Application endpoint port
  uvicornPort: 8080
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  service:
    type: ClusterIP
    annotations: {}
  tolerations: []

api:
  # Path of the external authentication service on the Hub.
  # The string must contain `%s`, which will be replaced with the dataset name.
  hfAuthPath: "/api/datasets/%s/auth-check"
  # Number of seconds to set in the `max-age` header on data endpoints
  maxAgeLong: "120"
  # Number of seconds to set in the `max-age` header on technical endpoints
  maxAgeShort: "10"
  # Directory where the uvicorn workers write their prometheus metrics
  # see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
  prometheusMultiprocDirectory: "/tmp"
  # Hostname - it must not be set to localhost to work in Kubernetes!
  uvicornHostname: "0.0.0.0"
  # Number of uvicorn workers for running the application
  uvicornNumWorkers: "1"
  # Application endpoint port
  uvicornPort: 8080
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  service:
    type: ClusterIP
    annotations: {}
  tolerations: []
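
# Both services scale either by adding pods or by adding uvicorn workers per
# pod. A sketch (the counts are illustrative):
#   api:
#     replicas: 2
#     uvicornNumWorkers: "4"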

# --- workers ---

configNames:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 1
    limits:
      cpu: 1
  tolerations: []

splitNames:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 1
    limits:
      cpu: 1
  tolerations: []

splits:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []

firstRows:
  # Max size (in bytes) of a dataset allowed to fall back to normal (non-streaming) mode if streaming fails
  fallbackMaxDatasetSize: "100_000_000"
  # Max size (in bytes) of the /first-rows endpoint response
  maxBytes: "1_000_000"
  # Max number of rows in the /first-rows endpoint response
  maxNumber: 100
  # Min size (in bytes) of a cell in the /first-rows endpoint response
  minCellBytes: 100
  # Min number of rows in the /first-rows endpoint response
  minNumber: 10
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  # Max number of columns in the /first-rows endpoint response
  columnsMaxNumber: 1_000
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []
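
# A sketch of loosening the /first-rows response limits (the values are
# illustrative, not recommendations):
#   firstRows:
#     maxBytes: "2_000_000"
#     maxNumber: 200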

parquetAndDatasetInfo:
  # Comma-separated list of the blocked datasets. Defaults to empty.
  blockedDatasets: ""
  # Git commit message used when the parquet files are uploaded to the Hub. Defaults to `Update parquet files`.
  commitMessage: "Update parquet files"
  # Maximum size (in bytes) of the supported datasets. Bigger datasets, or datasets for which the size cannot be obtained, are ignored.
  maxDatasetSize: "100_000_000"
  # Git revision of the dataset used to prepare the parquet files. Defaults to `main`.
  sourceRevision: "main"
  # Comma-separated list of the supported datasets. If empty, all the datasets are processed. Defaults to empty.
  supportedDatasets: ""
  # Git revision of the dataset where the parquet files are stored. Make sure the hf_token (see the "common" section) allows writing there. Defaults to `refs/convert/parquet`.
  targetRevision: "refs/convert/parquet"
  # URL template used to build the parquet file URLs. Defaults to `/datasets/%s/resolve/%s/%s`.
  urlTemplate: "/datasets/%s/resolve/%s/%s"
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []
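
# A sketch of restricting the parquet conversion to a short allow-list and
# raising the size cap (the dataset names are hypothetical placeholders):
#   parquetAndDatasetInfo:
#     supportedDatasets: "user/dataset-a,user/dataset-b"
#     maxDatasetSize: "500_000_000"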

parquet:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []

datasetInfo:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []

sizes:
  # Override the common queue parameters
  queue:
    # Maximum number of jobs running at the same time for the same namespace
    maxJobsPerNamespace: 1
  nodeSelector: {}
  replicas: 1
  resources:
    requests:
      cpu: 0
    limits:
      cpu: 0
  tolerations: []