Split first rows from parquet new Job Runner #988

Merged · 6 commits · Mar 31, 2023
6 changes: 6 additions & 0 deletions libs/libcommon/src/libcommon/config.py
@@ -19,6 +19,7 @@
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_DATASET_INFO_VERSION,
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_STREAMING_VERSION,
PROCESSING_STEP_PARQUET_AND_DATASET_INFO_VERSION,
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION,
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION,
PROCESSING_STEP_SPLIT_NAMES_FROM_DATASET_INFO_VERSION,
PROCESSING_STEP_SPLIT_NAMES_FROM_STREAMING_VERSION,
@@ -138,6 +139,11 @@ class ProcessingGraphConfig:
"requires": "/parquet-and-dataset-info",
"job_runner_version": PROCESSING_STEP_CONFIG_PARQUET_VERSION,
},
"split-first-rows-from-parquet": {
"input_type": "split",
"requires": "config-parquet",
"job_runner_version": PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION,
},
"dataset-parquet": {
"input_type": "dataset",
"requires": "config-parquet",
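The new graph entry above only declares `input_type`, `requires`, and `job_runner_version`; the parent/child links asserted in the tests further down are derived from that spec. A minimal sketch of how such a resolution can work, assuming a simplified one-parent graph rather than libcommon's actual `ProcessingGraph`:

```python
# Minimal sketch (not the library code): resolve a spec like the one above into
# parent/child links, assuming each entry names at most one "requires" step.
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class Step:
    name: str
    input_type: str
    requires: Optional[str] = None
    children: List["Step"] = field(default_factory=list)


def build_graph(spec: Dict[str, dict]) -> Dict[str, Step]:
    steps = {
        name: Step(name=name, input_type=s["input_type"], requires=s.get("requires"))
        for name, s in spec.items()
    }
    for step in steps.values():
        if step.requires:
            steps[step.requires].children.append(step)
    return steps


graph = build_graph(
    {
        "config-parquet": {"input_type": "config"},
        "split-first-rows-from-parquet": {"input_type": "split", "requires": "config-parquet"},
    }
)
assert [c.name for c in graph["config-parquet"].children] == ["split-first-rows-from-parquet"]
```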
3 changes: 3 additions & 0 deletions libs/libcommon/src/libcommon/constants.py
@@ -16,7 +16,10 @@
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_DATASET_INFO_VERSION = 1
PROCESSING_STEP_DATASET_SPLIT_NAMES_FROM_STREAMING_VERSION = 1
PROCESSING_STEP_PARQUET_AND_DATASET_INFO_VERSION = 1
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION = 1
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_STREAMING_VERSION = 2
PROCESSING_STEP_SPLIT_NAMES_FROM_DATASET_INFO_VERSION = 2
PROCESSING_STEP_SPLIT_NAMES_FROM_STREAMING_VERSION = 2
PROCESSING_STEP_SPLITS_VERSION = 2

PARQUET_REVISION = "refs/convert/parquet"
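Each step's version constant is what lets a worker recognize cache entries produced by an older runner. A hypothetical illustration of such a check using the new constant (not libcommon's actual helper):

```python
# Hypothetical staleness check, assuming the cache stores the version of the
# job runner that wrote each entry.
PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION = 1


def is_entry_stale(cached_job_runner_version: int) -> bool:
    # Entries written by an older runner version should be recomputed.
    return cached_job_runner_version < PROCESSING_STEP_SPLIT_FIRST_ROWS_FROM_PARQUET_VERSION


assert is_entry_stale(0) is True
assert is_entry_stale(1) is False
```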
8 changes: 7 additions & 1 deletion libs/libcommon/tests/test_processing_steps.py
@@ -23,6 +23,7 @@ def test_default_graph() -> None:
split_names_from_dataset_info = graph.get_step("/split-names-from-dataset-info")
dataset_split_names_from_streaming = graph.get_step("dataset-split-names-from-streaming")
dataset_split_names_from_dataset_info = graph.get_step("dataset-split-names-from-dataset-info")
split_first_rows_from_parquet = graph.get_step("split-first-rows-from-parquet")

assert config_names is not None
assert config_names.parent is None
@@ -51,7 +52,7 @@

assert config_parquet is not None
assert config_parquet.parent is parquet_and_dataset_info
assert config_parquet.children == [dataset_parquet]
assert config_parquet.children == [split_first_rows_from_parquet, dataset_parquet]
assert config_parquet.get_ancestors() == [parquet_and_dataset_info]

assert dataset_parquet is not None
@@ -98,5 +99,10 @@ def test_default_graph() -> None:
split_names_from_dataset_info,
]

assert split_first_rows_from_parquet is not None
assert split_first_rows_from_parquet.parent is config_parquet
assert split_first_rows_from_parquet.children == []
assert split_first_rows_from_parquet.get_ancestors() == [parquet_and_dataset_info, config_parquet]

assert graph.get_first_steps() == [config_names, splits, parquet_and_dataset_info]
assert graph.get_steps_required_by_dataset_viewer() == [splits, split_first_rows_from_streaming]
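The `get_ancestors()` assertions above follow single-parent chains up the graph. A minimal sketch of that walk, assuming a simplified `Step` with one optional parent:

```python
# Sketch of the ancestor walk the assertions rely on (simplified; ancestors of a
# single-parent step form a chain from the root down to the direct parent).
from typing import List, Optional


class Step:
    def __init__(self, name: str, parent: Optional["Step"] = None) -> None:
        self.name = name
        self.parent = parent

    def get_ancestors(self) -> List["Step"]:
        return [] if self.parent is None else self.parent.get_ancestors() + [self.parent]


parquet_and_dataset_info = Step("/parquet-and-dataset-info")
config_parquet = Step("config-parquet", parent=parquet_and_dataset_info)
split_first_rows_from_parquet = Step("split-first-rows-from-parquet", parent=config_parquet)
assert split_first_rows_from_parquet.get_ancestors() == [parquet_and_dataset_info, config_parquet]
```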
2 changes: 1 addition & 1 deletion services/api/src/api/config.py
@@ -116,7 +116,7 @@ class EndpointConfig:
"dataset": ["/splits", "dataset-split-names-from-streaming", "dataset-split-names-from-dataset-info"],
"config": ["/split-names-from-streaming", "/split-names-from-dataset-info"],
},
"/first-rows": {"split": ["split-first-rows-from-streaming"]},
"/first-rows": {"split": ["split-first-rows-from-streaming", "split-first-rows-from-parquet"]},
"/parquet-and-dataset-info": {"dataset": ["/parquet-and-dataset-info"]},
"/parquet": {
"dataset": ["dataset-parquet"],
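Mapping `/first-rows` to two processing steps means the endpoint can be answered from whichever cache entry exists. A hedged sketch of that lookup, assuming the first available entry wins and using a stand-in `get_cache_entry` rather than the real cache API:

```python
# Sketch: serve an endpoint mapped to several processing steps by returning the
# first step whose cache entry exists. `get_cache_entry` is a stand-in callable.
from typing import Callable, List, Optional


def get_first_rows_response(
    steps: List[str],
    get_cache_entry: Callable[[str], Optional[dict]],
) -> Optional[dict]:
    for step in steps:  # order matters: first hit wins
        entry = get_cache_entry(step)
        if entry is not None:
            return entry
    return None


fake_cache = {"split-first-rows-from-parquet": {"rows": []}}
response = get_first_rows_response(
    ["split-first-rows-from-streaming", "split-first-rows-from-parquet"],
    fake_cache.get,
)
assert response == {"rows": []}
```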
2 changes: 1 addition & 1 deletion services/api/tests/routes/test_endpoint.py
@@ -43,7 +43,7 @@ def test_endpoints_definition() -> None:
assert first_rows is not None
assert sorted(list(first_rows)) == ["split"]
assert first_rows["split"] is not None
assert len(first_rows["split"]) == 1 # Only has one processing step
assert len(first_rows["split"]) == 2 # Has two processing steps

parquet_and_dataset_info = definition["/parquet-and-dataset-info"]
assert parquet_and_dataset_info is not None
18 changes: 14 additions & 4 deletions services/worker/src/worker/features.py
@@ -1,10 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2022 The HuggingFace Authors.

import io
import json
from typing import Any, List, Optional, Union
from zlib import adler32

import numpy
import soundfile # type:ignore
from datasets import (
Array2D,
Array3D,
@@ -19,7 +22,6 @@
Value,
)
from libcommon.storage import StrPath
from numpy import ndarray
from PIL import Image as PILImage # type: ignore

from worker.asset import create_audio_files, create_image_file
@@ -56,7 +58,11 @@ def image(
if value is None:
return None
if not isinstance(value, PILImage.Image):
raise TypeError("image cell must be a PIL image")
try:
image_bytes = value["bytes"]
value = PILImage.open(io.BytesIO(image_bytes))
except Exception:
raise TypeError("image cell must be a PIL image")
# attempt to generate one of the supported formats; if unsuccessful, throw an error
for ext in [".jpg", ".png"]:
try:
@@ -96,8 +102,12 @@ def audio(
array = value["array"]
sampling_rate = value["sampling_rate"]
except Exception as e:
raise TypeError("audio cell must contain 'array' and 'sampling_rate' fields") from e
if type(array) != ndarray:
if "bytes" in value:
bytes_array, sampling_rate = soundfile.read(io.BytesIO(value["bytes"]))
array = numpy.array(bytes_array)
else:
raise TypeError("audio cell must contain 'array' and 'sampling_rate' fields") from e
if type(array) != numpy.ndarray:
raise TypeError("'array' field must be a numpy.ndarray")
if type(sampling_rate) != int:
raise TypeError("'sampling_rate' field must be an integer")
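The `image` and `audio` changes above accept cells that arrive as encoded bytes, which is how rows read back from parquet (rather than streamed) typically carry them. A minimal, self-contained sketch of the two decoding paths, assuming cells shaped like `{"bytes": ...}`:

```python
# Sketch of the decoding fallbacks added above, assuming parquet-backed rows may
# carry raw encoded bytes instead of already-decoded objects.
import io

import numpy
import soundfile  # type: ignore
from PIL import Image as PILImage  # type: ignore


def decode_image_cell(value):
    if isinstance(value, PILImage.Image):
        return value
    # Fall back to decoding the encoded bytes into a PIL image.
    return PILImage.open(io.BytesIO(value["bytes"]))


def decode_audio_cell(value):
    if "array" in value and "sampling_rate" in value:
        return numpy.asarray(value["array"]), value["sampling_rate"]
    # Fall back to decoding the encoded audio bytes (wav, flac, ...) with soundfile.
    data, sampling_rate = soundfile.read(io.BytesIO(value["bytes"]))
    return numpy.asarray(data), sampling_rate
```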
14 changes: 14 additions & 0 deletions services/worker/src/worker/job_runner_factory.py
@@ -31,6 +31,9 @@
DatasetSplitNamesFromStreamingJobRunner,
)
from worker.job_runners.parquet_and_dataset_info import ParquetAndDatasetInfoJobRunner
from worker.job_runners.split.first_rows_from_parquet import (
SplitFirstRowsFromParquetJobRunner,
)
from worker.job_runners.split.first_rows_from_streaming import (
SplitFirstRowsFromStreamingJobRunner,
)
@@ -172,6 +175,16 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
common_config=self.app_config.common,
worker_config=self.app_config.worker,
)
if job_type == SplitFirstRowsFromParquetJobRunner.get_job_type():
first_rows_config = FirstRowsConfig.from_env()
return SplitFirstRowsFromParquetJobRunner(
job_info=job_info,
app_config=self.app_config,
processing_step=processing_step,
hf_datasets_cache=self.hf_datasets_cache,
first_rows_config=first_rows_config,
assets_directory=self.assets_directory,
)
supported_job_types = [
ConfigNamesJobRunner.get_job_type(),
SplitNamesFromStreamingJobRunner.get_job_type(),
@@ -187,5 +200,6 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
SplitNamesFromDatasetInfoJobRunner.get_job_type(),
DatasetSplitNamesFromStreamingJobRunner.get_job_type(),
DatasetSplitNamesFromDatasetInfoJobRunner.get_job_type(),
SplitFirstRowsFromParquetJobRunner.get_job_type(),
]
raise ValueError(f"Unsupported job type: '{job_type}'. The supported job types are: {supported_job_types}")
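The factory gains one more branch keyed on `get_job_type()`. The same dispatch-and-error behaviour can be pictured as a registry lookup; this is a hypothetical sketch, not the repository's code:

```python
# Hypothetical registry-style dispatch with the same behaviour as the if-chain:
# the job type string selects the runner class, unknown types raise ValueError.
from typing import Dict, Type


class JobRunner:
    @staticmethod
    def get_job_type() -> str:
        raise NotImplementedError


class SplitFirstRowsFromParquetJobRunner(JobRunner):
    @staticmethod
    def get_job_type() -> str:
        return "split-first-rows-from-parquet"


RUNNERS: Dict[str, Type[JobRunner]] = {
    cls.get_job_type(): cls for cls in (SplitFirstRowsFromParquetJobRunner,)
}


def create_job_runner(job_type: str) -> JobRunner:
    try:
        return RUNNERS[job_type]()
    except KeyError:
        raise ValueError(
            f"Unsupported job type: '{job_type}'. The supported job types are: {sorted(RUNNERS)}"
        ) from None


runner = create_job_runner("split-first-rows-from-parquet")
assert runner.get_job_type() == "split-first-rows-from-parquet"
```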