Skip to content

Commit

Permalink
test: 💍 add tests for missing fields and None value (#606)
Browse files Browse the repository at this point in the history
* test: 💍 add tests for missing fields and None value

also: use JSONL file for tests

* feat: 🎸 update docker images
  • Loading branch information
severo authored Oct 11, 2022
1 parent ab3c931 commit 7697e4b
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 5 deletions.
4 changes: 2 additions & 2 deletions chart/docker-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"api": "huggingface/datasets-server-api:sha-7210df0",
"reverseProxy": "docker.io/nginx:1.20",
"worker": {
"splits": "huggingface/datasets-server-worker:sha-a111310",
"firstRows": "huggingface/datasets-server-worker:sha-a111310"
"splits": "huggingface/datasets-server-worker:sha-06c9c4b",
"firstRows": "huggingface/datasets-server-worker:sha-06c9c4b"
}
}
}
3 changes: 3 additions & 0 deletions services/worker/src/worker/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ def get_cell_value(
assets_base_url: str,
json_path: List[Union[str, int]] = None,
) -> Any:
# always allow None values in the cells
if cell is None:
return cell
if isinstance(fieldType, Image):
return image(dataset, config, split, row_idx, cell, featureName, assets_base_url, json_path)
elif isinstance(fieldType, Audio):
Expand Down
2 changes: 1 addition & 1 deletion services/worker/src/worker/responses/first_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def transform_rows(
config,
split,
row_idx,
row[featureName],
row[featureName] if featureName in row else None,
featureName,
fieldType,
assets_base_url,
Expand Down
1 change: 1 addition & 0 deletions services/worker/tests/fixtures/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,4 +126,5 @@ def datasets() -> Dict[str, Dataset]:
"sequence_of_dicts": other(
[{"a": {"b": 0}}, {"a": {"b": 1}}], Sequence(feature={"a": {"b": Value(dtype="int64")}})
),
"none_value": other({"a": None}, {"a": Value(dtype="int64")}),
}
17 changes: 17 additions & 0 deletions services/worker/tests/fixtures/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Copyright 2022 The HuggingFace Authors.

import csv
import json

import pytest

Expand All @@ -22,3 +23,19 @@ def csv_path(tmp_path_factory: pytest.TempPathFactory) -> str:
for item in DATA:
writer.writerow(item)
return path


# Records that the jsonl_path fixture serializes with json.dumps into the
# "dataset.jsonl" test file.
# The second record sets "col_1" explicitly to None and the third record omits
# the "col_1" key altogether, so both the None-value and the missing-field
# cases are present in the data.
JSONL = [
{"col_1": "0", "col_2": 0, "col_3": 0.0},
{"col_1": None, "col_2": 1, "col_3": 1.0},
{"col_2": 2, "col_3": 2.0},
{"col_1": "3", "col_2": 3, "col_3": 3.0},
]


@pytest.fixture(scope="session")
def jsonl_path(tmp_path_factory: pytest.TempPathFactory) -> str:
    """Write the ``JSONL`` records to a temporary JSON Lines file.

    Each record is serialized with ``json.dumps`` and written on its own
    line, as the JSON Lines format requires.

    Args:
        tmp_path_factory: pytest factory used to create the temporary
            ``data`` directory that holds the file.

    Returns:
        The path of the written ``dataset.jsonl`` file, as a string.
    """
    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
    # newline="" disables newline translation, so "\n" is written verbatim
    # on every platform.
    with open(path, "w", newline="", encoding="utf-8") as f:
        # writelines() does not add separators itself: terminate every record
        # explicitly, otherwise all objects end up concatenated on one line.
        f.writelines(f"{json.dumps(record)}\n" for record in JSONL)
    return path
29 changes: 27 additions & 2 deletions services/worker/tests/fixtures/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,14 @@ def hub_gated_csv(hf_api: HfApi, hf_token: str, csv_path: str) -> Iterable[str]:
hf_api.delete_repo(repo_id=repo_id, token=hf_token, repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def hub_public_jsonl(hf_api: HfApi, hf_token: str, jsonl_path: str) -> Iterable[str]:
    """Yield the id of a dataset repo created from the JSONL file, then delete it.

    The repo is created through ``create_hub_dataset_repo`` with the ``jsonl``
    prefix and the file at ``jsonl_path``; once the fixture is torn down the
    repo is deleted again.
    """
    jsonl_repo = create_hub_dataset_repo(
        hf_api=hf_api,
        hf_token=hf_token,
        prefix="jsonl",
        file_paths=[jsonl_path],
    )
    yield jsonl_repo
    # Best-effort cleanup: HTTP errors and ValueError raised by the deletion
    # are deliberately ignored.
    with suppress(requests.exceptions.HTTPError, ValueError):
        hf_api.delete_repo(repo_id=jsonl_repo, token=hf_token, repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def hub_public_audio(hf_api: HfApi, hf_token: str, datasets: Dict[str, Dataset]) -> Iterable[str]:
repo_id = create_hub_dataset_repo(hf_api=hf_api, hf_token=hf_token, prefix="audio", dataset=datasets["audio"])
Expand Down Expand Up @@ -289,8 +297,6 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any])
}


# # column = "col"

DATA_cols = {
"col_1": {"_type": "Value", "id": None, "dtype": "int64"},
"col_2": {"_type": "Value", "id": None, "dtype": "int64"},
Expand All @@ -303,6 +309,19 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any])
{"col_1": 3, "col_2": 3, "col_3": 3.0},
]


# Expected feature description for the "jsonl" dataset: one string column and
# two numeric columns. Passed to get_first_rows_response together with
# JSONL_rows to build the expected first-rows response.
JSONL_cols = {
"col_1": {"_type": "Value", "id": None, "dtype": "string"},
"col_2": {"_type": "Value", "id": None, "dtype": "int64"},
"col_3": {"_type": "Value", "id": None, "dtype": "float64"},
}
# Expected rows for the "jsonl" dataset. "col_1" is None in both the second
# and the third row; every row carries all three columns.
# NOTE(review): presumably the third row corresponds to a source record that
# has no "col_1" key at all — confirm against the JSONL test data.
JSONL_rows = [
{"col_1": "0", "col_2": 0, "col_3": 0.0},
{"col_1": None, "col_2": 1, "col_3": 1.0},
{"col_1": None, "col_2": 2, "col_3": 2.0},
{"col_1": "3", "col_2": 3, "col_3": 3.0},
]

AUDIO_cols = {
"col": {
"_type": "Audio",
Expand Down Expand Up @@ -391,6 +410,7 @@ def hub_datasets(
hub_public_csv,
hub_private_csv,
hub_gated_csv,
hub_public_jsonl,
hub_public_audio,
hub_public_image,
hub_public_images_list,
Expand Down Expand Up @@ -421,6 +441,11 @@ def hub_datasets(
"splits_response": get_splits_response(hub_gated_csv, None, None),
"first_rows_response": get_first_rows_response(hub_gated_csv, DATA_cols, DATA_rows),
},
"jsonl": {
"name": hub_public_jsonl,
"splits_response": get_splits_response(hub_public_jsonl, None, None),
"first_rows_response": get_first_rows_response(hub_public_jsonl, JSONL_cols, JSONL_rows),
},
"audio": {
"name": hub_public_audio,
"splits_response": get_splits_response(hub_public_audio, 54.0, 1),
Expand Down
1 change: 1 addition & 0 deletions services/worker/tests/responses/test_first_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
("audio", False, None, None),
("image", False, None, None),
("images_list", False, None, None),
("jsonl", False, None, None),
("gated", True, None, None),
("private", True, None, None),
("empty", False, "EmptyDatasetError", "EmptyDatasetError"),
Expand Down
1 change: 1 addition & 0 deletions services/worker/tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,7 @@ def test_value(dataset_type, output_value, output_dtype, datasets) -> None:
{"a": Value(dtype="int64"), "b": [Image(decode=True, id=None)], "c": {"ca": [Audio()]}},
),
("sequence_of_dicts", {"a": [{"b": 0}, {"b": 1}]}, "Sequence"),
("none_value", {"a": None}, {"a": Value(dtype="int64", id=None)}),
],
)
def test_others(dataset_type: str, output_value: Any, output_type: Any, datasets: Dict[str, Dataset]) -> None:
Expand Down

0 comments on commit 7697e4b

Please sign in to comment.