From 7697e4bf1d0bded10f47591854316e88c74be094 Mon Sep 17 00:00:00 2001 From: Sylvain Lesage Date: Tue, 11 Oct 2022 11:40:40 +0200 Subject: [PATCH] =?UTF-8?q?test:=20=F0=9F=92=8D=20add=20tests=20for=20miss?= =?UTF-8?q?ing=20fields=20and=20None=20value=20(#606)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: 💍 add tests for missing fields and None value also: use JSONL file for tests * feat: 🎸 update docker images --- chart/docker-images.yaml | 4 +-- services/worker/src/worker/features.py | 3 ++ .../worker/src/worker/responses/first_rows.py | 2 +- services/worker/tests/fixtures/datasets.py | 1 + services/worker/tests/fixtures/files.py | 17 +++++++++++ services/worker/tests/fixtures/hub.py | 29 +++++++++++++++++-- .../worker/tests/responses/test_first_rows.py | 1 + services/worker/tests/test_features.py | 1 + 8 files changed, 53 insertions(+), 5 deletions(-) diff --git a/chart/docker-images.yaml b/chart/docker-images.yaml index b1be9ffa46..60d72397b9 100644 --- a/chart/docker-images.yaml +++ b/chart/docker-images.yaml @@ -4,8 +4,8 @@ "api": "huggingface/datasets-server-api:sha-7210df0", "reverseProxy": "docker.io/nginx:1.20", "worker": { - "splits": "huggingface/datasets-server-worker:sha-a111310", - "firstRows": "huggingface/datasets-server-worker:sha-a111310" + "splits": "huggingface/datasets-server-worker:sha-06c9c4b", + "firstRows": "huggingface/datasets-server-worker:sha-06c9c4b" } } } diff --git a/services/worker/src/worker/features.py b/services/worker/src/worker/features.py index 227278bffe..a63e7920d8 100644 --- a/services/worker/src/worker/features.py +++ b/services/worker/src/worker/features.py @@ -122,6 +122,9 @@ def get_cell_value( assets_base_url: str, json_path: List[Union[str, int]] = None, ) -> Any: + # always allow None values in the cells + if cell is None: + return cell if isinstance(fieldType, Image): return image(dataset, config, split, row_idx, cell, featureName, assets_base_url, json_path) elif isinstance(fieldType, Audio): diff --git a/services/worker/src/worker/responses/first_rows.py b/services/worker/src/worker/responses/first_rows.py index d71c06ceca..92f0c63ab8 100644 --- a/services/worker/src/worker/responses/first_rows.py +++ b/services/worker/src/worker/responses/first_rows.py @@ -196,7 +196,7 @@ def transform_rows( config, split, row_idx, - row[featureName], + row[featureName] if featureName in row else None, featureName, fieldType, assets_base_url, diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py index c50534c384..1a07dd2d99 100644 --- a/services/worker/tests/fixtures/datasets.py +++ b/services/worker/tests/fixtures/datasets.py @@ -126,4 +126,5 @@ def datasets() -> Dict[str, Dataset]: "sequence_of_dicts": other( [{"a": {"b": 0}}, {"a": {"b": 1}}], Sequence(feature={"a": {"b": Value(dtype="int64")}}) ), + "none_value": other({"a": None}, {"a": Value(dtype="int64")}), } diff --git a/services/worker/tests/fixtures/files.py b/services/worker/tests/fixtures/files.py index 4a2dd290ff..db2037c4ae 100644 --- a/services/worker/tests/fixtures/files.py +++ b/services/worker/tests/fixtures/files.py @@ -2,6 +2,7 @@ # Copyright 2022 The HuggingFace Authors. import csv +import json import pytest @@ -22,3 +23,19 @@ def csv_path(tmp_path_factory: pytest.TempPathFactory) -> str: for item in DATA: writer.writerow(item) return path + + +JSONL = [ + {"col_1": "0", "col_2": 0, "col_3": 0.0}, + {"col_1": None, "col_2": 1, "col_3": 1.0}, + {"col_2": 2, "col_3": 2.0}, + {"col_1": "3", "col_2": 3, "col_3": 3.0}, +] + + +@pytest.fixture(scope="session") +def jsonl_path(tmp_path_factory: pytest.TempPathFactory) -> str: + path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl") + with open(path, "w", newline="") as f: + f.writelines(json.dumps(o) for o in JSONL) + return path diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py index ac3efdf6e6..0cd976280e 100644 --- a/services/worker/tests/fixtures/hub.py +++ b/services/worker/tests/fixtures/hub.py @@ -214,6 +214,14 @@ def hub_gated_csv(hf_api: HfApi, hf_token: str, csv_path: str) -> Iterable[str]: hf_api.delete_repo(repo_id=repo_id, token=hf_token, repo_type="dataset") +@pytest.fixture(scope="session", autouse=True) +def hub_public_jsonl(hf_api: HfApi, hf_token: str, jsonl_path: str) -> Iterable[str]: + repo_id = create_hub_dataset_repo(hf_api=hf_api, hf_token=hf_token, prefix="jsonl", file_paths=[jsonl_path]) + yield repo_id + with suppress(requests.exceptions.HTTPError, ValueError): + hf_api.delete_repo(repo_id=repo_id, token=hf_token, repo_type="dataset") + + @pytest.fixture(scope="session", autouse=True) def hub_public_audio(hf_api: HfApi, hf_token: str, datasets: Dict[str, Dataset]) -> Iterable[str]: repo_id = create_hub_dataset_repo(hf_api=hf_api, hf_token=hf_token, prefix="audio", dataset=datasets["audio"]) @@ -289,8 +297,6 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any]) } -# # column = "col" - DATA_cols = { "col_1": {"_type": "Value", "id": None, "dtype": "int64"}, "col_2": {"_type": "Value", "id": None, "dtype": "int64"}, @@ -303,6 +309,19 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any]) {"col_1": 3, "col_2": 3, "col_3": 3.0}, ] + +JSONL_cols = { + "col_1": {"_type": "Value", "id": None, "dtype": "string"}, + "col_2": {"_type": "Value", "id": None, "dtype": "int64"}, + "col_3": {"_type": "Value", "id": None, "dtype": "float64"}, +} +JSONL_rows = [ + {"col_1": "0", "col_2": 0, "col_3": 0.0}, + {"col_1": None, "col_2": 1, "col_3": 1.0}, + {"col_1": None, "col_2": 2, "col_3": 2.0}, + {"col_1": "3", "col_2": 3, "col_3": 3.0}, +] + AUDIO_cols = { "col": { "_type": "Audio", @@ -391,6 +410,7 @@ def hub_datasets( hub_public_csv, hub_private_csv, hub_gated_csv, + hub_public_jsonl, hub_public_audio, hub_public_image, hub_public_images_list, @@ -421,6 +441,11 @@ def hub_datasets( "splits_response": get_splits_response(hub_gated_csv, None, None), "first_rows_response": get_first_rows_response(hub_gated_csv, DATA_cols, DATA_rows), }, + "jsonl": { + "name": hub_public_jsonl, + "splits_response": get_splits_response(hub_public_jsonl, None, None), + "first_rows_response": get_first_rows_response(hub_public_jsonl, JSONL_cols, JSONL_rows), + }, "audio": { "name": hub_public_audio, "splits_response": get_splits_response(hub_public_audio, 54.0, 1), diff --git a/services/worker/tests/responses/test_first_rows.py b/services/worker/tests/responses/test_first_rows.py index e021f2ccdd..6674b6bc21 100644 --- a/services/worker/tests/responses/test_first_rows.py +++ b/services/worker/tests/responses/test_first_rows.py @@ -17,6 +17,7 @@ ("audio", False, None, None), ("image", False, None, None), ("images_list", False, None, None), + ("jsonl", False, None, None), ("gated", True, None, None), ("private", True, None, None), ("empty", False, "EmptyDatasetError", "EmptyDatasetError"), diff --git a/services/worker/tests/test_features.py b/services/worker/tests/test_features.py index 51dce46d08..fdd6460728 100644 --- a/services/worker/tests/test_features.py +++ b/services/worker/tests/test_features.py @@ -274,6 +274,7 @@ def test_value(dataset_type, output_value, output_dtype, datasets) -> None: {"a": Value(dtype="int64"), "b": [Image(decode=True, id=None)], "c": {"ca": [Audio()]}}, ), ("sequence_of_dicts", {"a": [{"b": 0}, {"b": 1}]}, "Sequence"), + ("none_value", {"a": None}, {"a": Value(dtype="int64", id=None)}), ], ) def test_others(dataset_type: str, output_value: Any, output_type: Any, datasets: Dict[str, Dataset]) -> None: