update

lithomas1 · Jun 11, 2024 · c54316e · c54316e
1 parent cd6df5e
commit c54316e
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 32 deletions.
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -16,3 +16,4 @@ I/O Functions
     :maxdepth: 1
 
     avro
+    json
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -25,7 +25,7 @@ cpdef void write_json(
     str false_value = "false"
 ):
     """
-    Writes a :py:class:`~cudf._lib.pylibcudf.types.Table` to JSON format.
+    Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.
 
     Parameters
     ----------

diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import io
+import os
 from typing import Optional, Union
 
 import pyarrow as pa
@@ -136,6 +138,19 @@ def is_fixed_width(plc_dtype: plc.DataType):
     )
 
 
+def sink_to_str(sink):
+    if isinstance(sink, (str, os.PathLike)):
+        with open(sink, "r") as f:
+            str_result = f.read()
+    elif isinstance(sink, io.BytesIO):
+        sink.seek(0)
+        str_result = sink.read().decode()
+    else:
+        sink.seek(0)
+        str_result = sink.read()
+    return str_result
+
+
 # TODO: enable uint64, some failing tests
 NUMERIC_PA_TYPES = [pa.int64(), pa.float64()]  # pa.uint64()]
 STRING_PA_TYPES = [pa.string()]

diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 # Tell ruff it's OK that some imports occur after the sys.path.insert
 # ruff: noqa: E402
+import io
 import os
+import pathlib
 import sys
 
 import numpy as np
@@ -35,6 +37,8 @@ def numeric_pa_type(request):
     return request.param
 
 
+# TODO: Consider adding another fixture/adapting this
+# fixture to consider nullability
 @pytest.fixture(scope="session", params=[0, 100])
 def table_data(request):
     """
@@ -119,6 +123,24 @@ def _generate_struct_data(typ):
     ), pa_table
 
 
+@pytest.fixture(
+    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO(), io.StringIO()],
+)
+def source_or_sink(request, tmp_path):
+    fp_or_buf = request.param
+    if isinstance(fp_or_buf, str):
+        fp_or_buf = f"{tmp_path}/{fp_or_buf}"
+    elif isinstance(fp_or_buf, os.PathLike):
+        fp_or_buf = tmp_path.joinpath(fp_or_buf)
+
+    yield fp_or_buf
+    # Cleanup after ourselves
+    # since the BytesIO and StringIO objects get cached by pytest
+    if isinstance(fp_or_buf, io.IOBase):
+        fp_or_buf.seek(0)
+        fp_or_buf.truncate(0)
+
+
 @pytest.fixture(
     scope="session", params=[opt for opt in plc.types.Interpolation]
 )

diff --git a/python/cudf/cudf/pylibcudf_tests/test_json.py b/python/cudf/cudf/pylibcudf_tests/test_json.py
@@ -1,36 +1,26 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import io
-import os
-import pathlib
 
 import pandas as pd
+import pyarrow as pa
 import pytest
+from utils import sink_to_str
 
 import cudf._lib.pylibcudf as plc
 
 
-@pytest.fixture(
-    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO(), io.StringIO()],
-)
-def sink(request):
-    yield request.param
-    # Cleanup after ourselves
-    # since the BytesIO and StringIO objects get cached by pytest
-    if isinstance(request.param, io.IOBase):
-        buf = request.param
-        buf.seek(0)
-        buf.truncate(0)
-
-
+@pytest.mark.parametrize("rows_per_chunk", [8, 100])
 @pytest.mark.parametrize("lines", [True, False])
-def test_write_json_basic(table_data, sink, tmp_path, lines):
+def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
     plc_table_w_meta, pa_table = table_data
-    if isinstance(sink, str):
-        sink = f"{tmp_path}/{sink}"
-    elif isinstance(sink, os.PathLike):
-        sink = tmp_path.joinpath(sink)
+    sink = source_or_sink
+
+    kwargs = dict()
+    if rows_per_chunk <= plc_table_w_meta.tbl.num_rows():
+        kwargs["rows_per_chunk"] = rows_per_chunk
+
     plc.io.json.write_json(
-        plc.io.SinkInfo([sink]), plc_table_w_meta, lines=lines
+        plc.io.SinkInfo([sink]), plc_table_w_meta, lines=lines, **kwargs
     )
 
     # orient=records (basically what the cudf json writer does,
@@ -42,17 +32,102 @@ def test_write_json_basic(table_data, sink, tmp_path, lines):
 
     # Convert everything to string to make
     # comparisons easier
-
-    if isinstance(sink, (str, os.PathLike)):
-        with open(sink, "r") as f:
-            str_result = f.read()
-    elif isinstance(sink, io.BytesIO):
-        sink.seek(0)
-        str_result = sink.read().decode()
-    else:
-        sink.seek(0)
-        str_result = sink.read()
+    str_result = sink_to_str(sink)
 
     pd_result = exp.to_json(orient="records", lines=lines)
 
     assert str_result == pd_result
+
+
+@pytest.mark.parametrize("include_nulls", [True, False])
+@pytest.mark.parametrize("na_rep", ["null", "awef", ""])
+def test_write_json_nulls(na_rep, include_nulls):
+    names = ["a", "b"]
+    pa_tbl = pa.Table.from_arrays(
+        [pa.array([1.0, 2.0, None]), pa.array([True, None, False])],
+        names=names,
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        na_rep=na_rep,
+        include_nulls=include_nulls,
+    )
+
+    # orient=records (basically what the cudf json writer does,
+    # doesn't preserve colnames when there are zero rows in table)
+    exp = pa_tbl.to_pandas()
+
+    if len(exp) == 0:
+        exp = pd.DataFrame()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    if not include_nulls:
+        # No equivalent in pandas, so we just
+        # sanity check by making sure na_rep
+        # doesn't appear in the output
+
+        # don't quote null
+        for name in names:
+            assert f'{{"{name}":{na_rep}}}' not in str_result
+        return
+
+    # pandas doesn't support na_rep, so
+    # manually swap "null" for it via str.replace
+    pd_result = pd_result.replace("null", na_rep)
+
+    assert str_result == pd_result
+
+
+@pytest.mark.parametrize("true_value", ["True", "correct"])
+@pytest.mark.parametrize("false_value", ["False", "wrong"])
+def test_write_json_bool_opts(true_value, false_value):
+    names = ["a"]
+    pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names)
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        include_nulls=True,
+        na_rep="null",
+        true_value=true_value,
+        false_value=false_value,
+    )
+
+    # orient=records (basically what the cudf json writer does,
+    # doesn't preserve colnames when there are zero rows in table)
+    exp = pa_tbl.to_pandas()
+
+    if len(exp) == 0:
+        exp = pd.DataFrame()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    # pandas doesn't support custom true/false values, so
+    # manually swap them in via str.replace
+    if true_value != "true":
+        pd_result = pd_result.replace("true", true_value)
+    if false_value != "false":
+        pd_result = pd_result.replace("false", false_value)
+
+    assert str_result == pd_result
Original file line number	Diff line number	Diff line change
Expand Up		@@ -16,3 +16,4 @@ I/O Functions
		:maxdepth: 1

		avro
		json