Split first rows from parquet new Job Runner #988

Merged · 6 commits · Mar 31, 2023
6 changes: 6 additions & 0 deletions libs/libcommon/src/libcommon/config.py
@@ -19,6 +19,7 @@
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_DATASET_INFO_VERSION,
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_STREAMING_VERSION,
PROCESSING_STEP_PARQUET_AND_DATASET_INFO_VERSION,
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION,
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION,
PROCESSING_STEP_SPLIT_NAMES_FROM_DATASET_INFO_VERSION,
PROCESSING_STEP_SPLIT_NAMES_FROM_STREAMING_VERSION,
@@ -138,6 +139,11 @@ class ProcessingGraphConfig:
"requires": "/parquet-and-dataset-info",
"job_runner_version": PROCESSING_STEP_CONFIG_PARQUET_VERSION,
},
"split-first-rows-from-parquet": {
"input_type": "split",
"requires": "config-parquet",
"job_runner_version": PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION,
},
"dataset-parquet": {
"input_type": "dataset",
"requires": "config-parquet",
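The new graph entry above only declares `input_type`, `requires`, and `job_runner_version`; the parent/child links asserted in the tests further down are derived from that spec. A minimal sketch of how such a resolution can work, assuming a simplified one-parent graph rather than libcommon's actual `ProcessingGraph`:

```python
# Minimal sketch (not the library code): resolve a spec like the one above into
# parent/child links, assuming each entry names at most one "requires" step.
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class Step:
    name: str
    input_type: str
    requires: Optional[str] = None
    children: List["Step"] = field(default_factory=list)


def build_graph(spec: Dict[str, dict]) -> Dict[str, Step]:
    steps = {
        name: Step(name=name, input_type=s["input_type"], requires=s.get("requires"))
        for name, s in spec.items()
    }
    for step in steps.values():
        if step.requires:
            steps[step.requires].children.append(step)
    return steps


graph = build_graph(
    {
        "config-parquet": {"input_type": "config"},
        "split-first-rows-from-parquet": {"input_type": "split", "requires": "config-parquet"},
    }
)
assert [c.name for c in graph["config-parquet"].children] == ["split-first-rows-from-parquet"]
```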
3 changes: 3 additions & 0 deletions libs/libcommon/src/libcommon/constants.py
@@ -16,7 +16,10 @@
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_DATASET_INFO_VERSION = 1
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_STREAMING_VERSION = 1
PROCESSING_STEP_PARQUET_AND_DATASET_INFO_VERSION = 1
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION = 1
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION = 2
PROCESSING_STEP_SPLIT_NAMES_FROM_DATASET_INFO_VERSION = 2
PROCESSING_STEP_SPLIT_NAMES_FROM_STREAMING_VERSION = 2
PROCESSING_STEP_SPLITS_VERSION = 2

PARQUET_REVISION = "refs/convert/parquet"
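Each step's version constant is what lets a worker recognize cache entries produced by an older runner. A hypothetical illustration of such a check using the new constant (not libcommon's actual helper):

```python
# Hypothetical staleness check, assuming the cache stores the version of the
# job runner that wrote each entry.
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION = 1


def is_entry_stale(cached_job_runner_version: int) -> bool:
    # Entries written by an older runner version should be recomputed.
    return cached_job_runner_version < PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION


assert is_entry_stale(0) is True
assert is_entry_stale(1) is False
```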
8 changes: 7 additions & 1 deletion libs/libcommon/tests/test_processing_steps.py
@@ -23,6 +23,7 @@ def test_default_graph() -> None:
split_names_from_dataset_info = graph.get_step("/split-names-from-dataset-info")
dataset_split_names_from_streaming = graph.get_step("dataset-split-names-from-streaming")
dataset_split_names_from_dataset_info = graph.get_step("dataset-split-names-from-dataset-info")
split_first_rows_from_parquet = graph.get_step("split-first-rows-from-parquet")

assert config_names is not None
assert config_names.parent is None
@@ -51,7 +52,7 @@

assert config_parquet is not None
assert config_parquet.parent is parquet_and_dataset_info
assert config_parquet.children == [dataset_parquet]
assert config_parquet.children == [split_first_rows_from_parquet, dataset_parquet]
assert config_parquet.get_ancestors() == [parquet_and_dataset_info]

assert dataset_parquet is not None
@@ -98,5 +99,10 @@ def test_default_graph() -> None:
split_names_from_dataset_info,
]

assert split_first_rows_from_parquet is not None
assert split_first_rows_from_parquet.parent is config_parquet
assert split_first_rows_from_parquet.children == []
assert split_first_rows_from_parquet.get_ancestors() == [parquet_and_dataset_info, config_parquet]

assert graph.get_first_steps() == [config_names, splits, parquet_and_dataset_info]
assert graph.get_steps_required_by_dataset_viewer() == [splits, split_first_rows_from_streaming]
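The `get_ancestors()` assertions above follow single-parent chains up the graph. A minimal sketch of that walk, assuming a simplified `Step` with one optional parent:

```python
# Sketch of the ancestor walk the assertions rely on (simplified; ancestors of a
# single-parent step form a chain from the root down to the direct parent).
from typing import List, Optional


class Step:
    def __init__(self, name: str, parent: Optional["Step"] = None) -> None:
        self.name = name
        self.parent = parent

    def get_ancestors(self) -> List["Step"]:
        return [] if self.parent is None else self.parent.get_ancestors() + [self.parent]


parquet_and_dataset_info = Step("/parquet-and-dataset-info")
config_parquet = Step("config-parquet", parent=parquet_and_dataset_info)
split_first_rows_from_parquet = Step("split-first-rows-from-parquet", parent=config_parquet)
assert split_first_rows_from_parquet.get_ancestors() == [parquet_and_dataset_info, config_parquet]
```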
2 changes: 1 addition & 1 deletion services/api/src/api/config.py
@@ -116,7 +116,7 @@ class EndpointConfig:
"dataset": ["/splits", "dataset-split-names-from-streaming", "dataset-split-names-from-dataset-info"],
"config": ["/split-names-from-streaming", "/split-names-from-dataset-info"],
},
"/first-rows": {"split": ["split-first-rows-from-streaming"]},
"/first-rows": {"split": ["split-first-rows-from-streaming", "split-first-rows-from-parquet"]},
"/parquet-and-dataset-info": {"dataset": ["/parquet-and-dataset-info"]},
"/parquet": {
"dataset": ["dataset-parquet"],
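Mapping `/first-rows` to two processing steps means the endpoint can be answered from whichever cache entry exists. A hedged sketch of that lookup, assuming the first available entry wins and using a stand-in `get_cache_entry` rather than the real cache API:

```python
# Sketch: serve an endpoint mapped to several processing steps by returning the
# first step whose cache entry exists. `get_cache_entry` is a stand-in callable.
from typing import Callable, List, Optional


def get_first_rows_response(
    steps: List[str],
    get_cache_entry: Callable[[str], Optional[dict]],
) -> Optional[dict]:
    for step in steps:  # order matters: first hit wins
        entry = get_cache_entry(step)
        if entry is not None:
            return entry
    return None


fake_cache = {"split-first-rows-from-parquet": {"rows": []}}
response = get_first_rows_response(
    ["split-first-rows-from-streaming", "split-first-rows-from-parquet"],
    fake_cache.get,
)
assert response == {"rows": []}
```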
2 changes: 1 addition & 1 deletion services/api/tests/routes/test_endpoint.py
@@ -43,7 +43,7 @@ def test_endpoints_definition() -> None:
assert first_rows is not None
assert sorted(list(first_rows)) == ["split"]
assert first_rows["split"] is not None
assert len(first_rows["split"]) == 1 # Only has one processing step
assert len(first_rows["split"]) == 2 # Has two processing steps

parquet_and_dataset_info = definition["/parquet-and-dataset-info"]
assert parquet_and_dataset_info is not None
18 changes: 14 additions & 4 deletions services/worker/src/worker/features.py
@@ -1,10 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2022 The HuggingFace Authors.

import io
import json
from typing import Any, List, Optional, Union
from zlib import adler32

import numpy
import soundfile # type:ignore
from datasets import (
Array2D,
Array3D,
@@ -19,7 +22,6 @@
Value,
)
from libcommon.storage import StrPath
from numpy import ndarray
from PIL import Image as PILImage # type: ignore

from worker.asset import create_audio_files, create_image_file
@@ -56,7 +58,11 @@ def image(
if value is None:
return None
if not isinstance(value, PILImage.Image):
raise TypeError("image cell must be a PIL image")
try:
image_bytes = value["bytes"]
value = PILImage.open(io.BytesIO(image_bytes))
except Exception:
raise TypeError("image cell must be a PIL image")
# attempt to generate one of the supported formats; if unsuccessful, throw an error
for ext in [".jpg", ".png"]:
try:
@@ -96,8 +102,12 @@ def audio(
array = value["array"]
sampling_rate = value["sampling_rate"]
except Exception as e:
raise TypeError("audio cell must contain 'array' and 'sampling_rate' fields") from e
if type(array) != ndarray:
if "bytes" in value:
bytes_array, sampling_rate = soundfile.read(io.BytesIO(value["bytes"]))
array = numpy.array(bytes_array)
else:
raise TypeError("audio cell must contain 'array' and 'sampling_rate' fields") from e
if type(array) != numpy.ndarray:
raise TypeError("'array' field must be a numpy.ndarray")
if type(sampling_rate) != int:
raise TypeError("'sampling_rate' field must be an integer")
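The `image` and `audio` changes above accept cells that arrive as encoded bytes, which is how rows read back from parquet (rather than streamed) typically carry them. A minimal, self-contained sketch of the two decoding paths, assuming cells shaped like `{"bytes": ...}`:

```python
# Sketch of the decoding fallbacks added above, assuming parquet-backed rows may
# carry raw encoded bytes instead of already-decoded objects.
import io

import numpy
import soundfile  # type: ignore
from PIL import Image as PILImage  # type: ignore


def decode_image_cell(value):
    if isinstance(value, PILImage.Image):
        return value
    # Fall back to decoding the encoded bytes into a PIL image.
    return PILImage.open(io.BytesIO(value["bytes"]))


def decode_audio_cell(value):
    if "array" in value and "sampling_rate" in value:
        return numpy.asarray(value["array"]), value["sampling_rate"]
    # Fall back to decoding the encoded audio bytes (wav, flac, ...) with soundfile.
    data, sampling_rate = soundfile.read(io.BytesIO(value["bytes"]))
    return numpy.asarray(data), sampling_rate
```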
14 changes: 14 additions & 0 deletions services/worker/src/worker/job_runner_factory.py
@@ -31,6 +31,9 @@
DatasetSplitNamesFromStreamingJobRunner,
)
from worker.job_runners.parquet_and_dataset_info import ParquetAndDatasetInfoJobRunner
from worker.job_runners.split.first_rows_from_parquet import (
SplitFirstRowsFromParquetJobRunner,
)
from worker.job_runners.split.first_rows_from_streaming import (
SplitFirstRowsFromStreamingJobRunner,
)
@@ -172,6 +175,16 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
common_config=self.app_config.common,
worker_config=self.app_config.worker,
)
if job_type == SplitFirstRowsFromParquetJobRunner.get_job_type():
first_rows_config = FirstRowsConfig.from_env()
return SplitFirstRowsFromParquetJobRunner(
job_info=job_info,
app_config=self.app_config,
processing_step=processing_step,
hf_datasets_cache=self.hf_datasets_cache,
first_rows_config=first_rows_config,
assets_directory=self.assets_directory,
)
supported_job_types = [
ConfigNamesJobRunner.get_job_type(),
SplitNamesFromStreamingJobRunner.get_job_type(),
@@ -187,5 +200,6 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
SplitNamesFromDatasetInfoJobRunner.get_job_type(),
DatasetSplitNamesFromStreamingJobRunner.get_job_type(),
DatasetSplitNamesFromDatasetInfoJobRunner.get_job_type(),
SplitFirstRowsFromParquetJobRunner.get_job_type(),
]
raise ValueError(f"Unsupported job type: '{job_type}'. The supported job types are: {supported_job_types}")
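The factory gains one more branch keyed on `get_job_type()`. The same dispatch-and-error behaviour can be pictured as a registry lookup; this is a hypothetical sketch, not the repository's code:

```python
# Hypothetical registry-style dispatch with the same behaviour as the if-chain:
# the job type string selects the runner class, unknown types raise ValueError.
from typing import Dict, Type


class JobRunner:
    @staticmethod
    def get_job_type() -> str:
        raise NotImplementedError


class SplitFirstRowsFromParquetJobRunner(JobRunner):
    @staticmethod
    def get_job_type() -> str:
        return "split-first-rows-from-parquet"


RUNNERS: Dict[str, Type[JobRunner]] = {
    cls.get_job_type(): cls for cls in (SplitFirstRowsFromParquetJobRunner,)
}


def create_job_runner(job_type: str) -> JobRunner:
    try:
        return RUNNERS[job_type]()
    except KeyError:
        raise ValueError(
            f"Unsupported job type: '{job_type}'. The supported job types are: {sorted(RUNNERS)}"
        ) from None


runner = create_job_runner("split-first-rows-from-parquet")
assert runner.get_job_type() == "split-first-rows-from-parquet"
```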