From 7697e4bf1d0bded10f47591854316e88c74be094 Mon Sep 17 00:00:00 2001
From: Sylvain Lesage <sylvain.lesage@huggingface.co>
Date: Tue, 11 Oct 2022 11:40:40 +0200
Subject: [PATCH] =?UTF-8?q?test:=20=F0=9F=92=8D=20add=20tests=20for=20miss?=
 =?UTF-8?q?ing=20fields=20and=20None=20value=20(#606)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test: 💍 add tests for missing fields and None value

also: use JSONL file for tests

* feat: 🎸 update docker images
---
 chart/docker-images.yaml                      |  4 +--
 services/worker/src/worker/features.py        |  3 ++
 .../worker/src/worker/responses/first_rows.py |  2 +-
 services/worker/tests/fixtures/datasets.py    |  1 +
 services/worker/tests/fixtures/files.py       | 17 +++++++++++
 services/worker/tests/fixtures/hub.py         | 29 +++++++++++++++++--
 .../worker/tests/responses/test_first_rows.py |  1 +
 services/worker/tests/test_features.py        |  1 +
 8 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/chart/docker-images.yaml b/chart/docker-images.yaml
index b1be9ffa46..60d72397b9 100644
--- a/chart/docker-images.yaml
+++ b/chart/docker-images.yaml
@@ -4,8 +4,8 @@
     "api": "huggingface/datasets-server-api:sha-7210df0",
     "reverseProxy": "docker.io/nginx:1.20",
     "worker": {
-      "splits": "huggingface/datasets-server-worker:sha-a111310",
-      "firstRows": "huggingface/datasets-server-worker:sha-a111310"
+      "splits": "huggingface/datasets-server-worker:sha-06c9c4b",
+      "firstRows": "huggingface/datasets-server-worker:sha-06c9c4b"
     }
   }
 }
diff --git a/services/worker/src/worker/features.py b/services/worker/src/worker/features.py
index 227278bffe..a63e7920d8 100644
--- a/services/worker/src/worker/features.py
+++ b/services/worker/src/worker/features.py
@@ -122,6 +122,9 @@ def get_cell_value(
     assets_base_url: str,
     json_path: List[Union[str, int]] = None,
 ) -> Any:
+    # always allow None values in the cells
+    if cell is None:
+        return cell
     if isinstance(fieldType, Image):
         return image(dataset, config, split, row_idx, cell, featureName, assets_base_url, json_path)
     elif isinstance(fieldType, Audio):
diff --git a/services/worker/src/worker/responses/first_rows.py b/services/worker/src/worker/responses/first_rows.py
index d71c06ceca..92f0c63ab8 100644
--- a/services/worker/src/worker/responses/first_rows.py
+++ b/services/worker/src/worker/responses/first_rows.py
@@ -196,7 +196,7 @@ def transform_rows(
                 config,
                 split,
                 row_idx,
-                row[featureName],
+                row[featureName] if featureName in row else None,
                 featureName,
                 fieldType,
                 assets_base_url,
diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py
index c50534c384..1a07dd2d99 100644
--- a/services/worker/tests/fixtures/datasets.py
+++ b/services/worker/tests/fixtures/datasets.py
@@ -126,4 +126,5 @@ def datasets() -> Dict[str, Dataset]:
         "sequence_of_dicts": other(
             [{"a": {"b": 0}}, {"a": {"b": 1}}], Sequence(feature={"a": {"b": Value(dtype="int64")}})
         ),
+        "none_value": other({"a": None}, {"a": Value(dtype="int64")}),
     }
diff --git a/services/worker/tests/fixtures/files.py b/services/worker/tests/fixtures/files.py
index 4a2dd290ff..db2037c4ae 100644
--- a/services/worker/tests/fixtures/files.py
+++ b/services/worker/tests/fixtures/files.py
@@ -2,6 +2,7 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import csv
+import json
 
 import pytest
 
@@ -22,3 +23,19 @@ def csv_path(tmp_path_factory: pytest.TempPathFactory) -> str:
         for item in DATA:
             writer.writerow(item)
     return path
+
+
+JSONL = [
+    {"col_1": "0", "col_2": 0, "col_3": 0.0},
+    {"col_1": None, "col_2": 1, "col_3": 1.0},
+    {"col_2": 2, "col_3": 2.0},
+    {"col_1": "3", "col_2": 3, "col_3": 3.0},
+]
+
+
+@pytest.fixture(scope="session")
+def jsonl_path(tmp_path_factory: pytest.TempPathFactory) -> str:
+    path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
+    with open(path, "w", newline="") as f:
+        f.writelines(json.dumps(o) for o in JSONL)
+    return path
diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py
index ac3efdf6e6..0cd976280e 100644
--- a/services/worker/tests/fixtures/hub.py
+++ b/services/worker/tests/fixtures/hub.py
@@ -214,6 +214,14 @@ def hub_gated_csv(hf_api: HfApi, hf_token: str, csv_path: str) -> Iterable[str]:
         hf_api.delete_repo(repo_id=repo_id, token=hf_token, repo_type="dataset")
 
 
+@pytest.fixture(scope="session", autouse=True)
+def hub_public_jsonl(hf_api: HfApi, hf_token: str, jsonl_path: str) -> Iterable[str]:
+    repo_id = create_hub_dataset_repo(hf_api=hf_api, hf_token=hf_token, prefix="jsonl", file_paths=[jsonl_path])
+    yield repo_id
+    with suppress(requests.exceptions.HTTPError, ValueError):
+        hf_api.delete_repo(repo_id=repo_id, token=hf_token, repo_type="dataset")
+
+
 @pytest.fixture(scope="session", autouse=True)
 def hub_public_audio(hf_api: HfApi, hf_token: str, datasets: Dict[str, Dataset]) -> Iterable[str]:
     repo_id = create_hub_dataset_repo(hf_api=hf_api, hf_token=hf_token, prefix="audio", dataset=datasets["audio"])
@@ -289,8 +297,6 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any])
     }
 
 
-#        # column = "col"
-
 DATA_cols = {
     "col_1": {"_type": "Value", "id": None, "dtype": "int64"},
     "col_2": {"_type": "Value", "id": None, "dtype": "int64"},
@@ -303,6 +309,19 @@ def get_first_rows_response(dataset: str, cols: Dict[str, Any], rows: List[Any])
     {"col_1": 3, "col_2": 3, "col_3": 3.0},
 ]
 
+
+JSONL_cols = {
+    "col_1": {"_type": "Value", "id": None, "dtype": "string"},
+    "col_2": {"_type": "Value", "id": None, "dtype": "int64"},
+    "col_3": {"_type": "Value", "id": None, "dtype": "float64"},
+}
+JSONL_rows = [
+    {"col_1": "0", "col_2": 0, "col_3": 0.0},
+    {"col_1": None, "col_2": 1, "col_3": 1.0},
+    {"col_1": None, "col_2": 2, "col_3": 2.0},
+    {"col_1": "3", "col_2": 3, "col_3": 3.0},
+]
+
 AUDIO_cols = {
     "col": {
         "_type": "Audio",
@@ -391,6 +410,7 @@ def hub_datasets(
     hub_public_csv,
     hub_private_csv,
     hub_gated_csv,
+    hub_public_jsonl,
     hub_public_audio,
     hub_public_image,
     hub_public_images_list,
@@ -421,6 +441,11 @@ def hub_datasets(
             "splits_response": get_splits_response(hub_gated_csv, None, None),
             "first_rows_response": get_first_rows_response(hub_gated_csv, DATA_cols, DATA_rows),
         },
+        "jsonl": {
+            "name": hub_public_jsonl,
+            "splits_response": get_splits_response(hub_public_jsonl, None, None),
+            "first_rows_response": get_first_rows_response(hub_public_jsonl, JSONL_cols, JSONL_rows),
+        },
         "audio": {
             "name": hub_public_audio,
             "splits_response": get_splits_response(hub_public_audio, 54.0, 1),
diff --git a/services/worker/tests/responses/test_first_rows.py b/services/worker/tests/responses/test_first_rows.py
index e021f2ccdd..6674b6bc21 100644
--- a/services/worker/tests/responses/test_first_rows.py
+++ b/services/worker/tests/responses/test_first_rows.py
@@ -17,6 +17,7 @@
         ("audio", False, None, None),
         ("image", False, None, None),
         ("images_list", False, None, None),
+        ("jsonl", False, None, None),
         ("gated", True, None, None),
         ("private", True, None, None),
         ("empty", False, "EmptyDatasetError", "EmptyDatasetError"),
diff --git a/services/worker/tests/test_features.py b/services/worker/tests/test_features.py
index 51dce46d08..fdd6460728 100644
--- a/services/worker/tests/test_features.py
+++ b/services/worker/tests/test_features.py
@@ -274,6 +274,7 @@ def test_value(dataset_type, output_value, output_dtype, datasets) -> None:
             {"a": Value(dtype="int64"), "b": [Image(decode=True, id=None)], "c": {"ca": [Audio()]}},
         ),
         ("sequence_of_dicts", {"a": [{"b": 0}, {"b": 1}]}, "Sequence"),
+        ("none_value", {"a": None}, {"a": Value(dtype="int64", id=None)}),
     ],
 )
 def test_others(dataset_type: str, output_value: Any, output_type: Any, datasets: Dict[str, Dataset]) -> None: