Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Jun 11, 2024
1 parent cd6df5e commit c54316e
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ I/O Functions
:maxdepth: 1

avro
json
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ cpdef void write_json(
str false_value = "false"
):
"""
Writes a :py:class:`~cudf._lib.pylibcudf.types.Table` to JSON format.
Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.
Parameters
----------
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import io
import os
from typing import Optional, Union

import pyarrow as pa
Expand Down Expand Up @@ -136,6 +138,19 @@ def is_fixed_width(plc_dtype: plc.DataType):
)


def sink_to_str(sink: Union[str, os.PathLike, io.IOBase]) -> str:
    """
    Return everything that was written to ``sink`` as a single string.

    Parameters
    ----------
    sink : str, os.PathLike, io.BytesIO, or other seekable buffer
        A path to a file on disk, or an in-memory buffer. Buffers are
        rewound to the beginning before being read, so their stream
        position is changed by this call.

    Returns
    -------
    str
        The full contents of the sink. Bytes (from a file or an
        ``io.BytesIO``) are decoded as UTF-8; any other buffer is
        expected to already yield ``str`` from ``read()``.
    """
    if isinstance(sink, (str, os.PathLike)):
        # Decode explicitly as UTF-8 so the file branch agrees with the
        # BytesIO branch (bytes.decode() defaults to UTF-8) instead of
        # depending on the platform's locale encoding.
        with open(sink, encoding="utf-8") as f:
            return f.read()

    # In-memory buffer: the writer left the cursor at the end, so rewind.
    sink.seek(0)
    contents = sink.read()
    if isinstance(sink, io.BytesIO):
        return contents.decode()
    return contents


# TODO: enable uint64, some failing tests
NUMERIC_PA_TYPES = [pa.int64(), pa.float64()] # pa.uint64()]
STRING_PA_TYPES = [pa.string()]
Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Tell ruff it's OK that some imports occur after the sys.path.insert
# ruff: noqa: E402
import io
import os
import pathlib
import sys

import numpy as np
Expand Down Expand Up @@ -35,6 +37,8 @@ def numeric_pa_type(request):
return request.param


# TODO: Consider adding another fixture/adapting this
# fixture to consider nullability
@pytest.fixture(scope="session", params=[0, 100])
def table_data(request):
"""
Expand Down Expand Up @@ -119,6 +123,24 @@ def _generate_struct_data(typ):
), pa_table


@pytest.fixture(
    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO],
)
def source_or_sink(request, tmp_path):
    """
    Provide each kind of I/O target the readers/writers are tested with.

    Yields, in turn: a ``str`` path, a ``pathlib.Path``, a fresh
    ``io.BytesIO`` and a fresh ``io.StringIO``. Relative file names are
    placed inside pytest's per-test ``tmp_path`` directory.

    The in-memory buffers are parametrized by *class* and instantiated
    here, so every test receives its own empty buffer. Parametrizing with
    instances would share one object across all tests, which requires
    manual truncation and breaks if any test closes the buffer.
    """
    fp_or_buf = request.param
    if isinstance(fp_or_buf, str):
        # Anchor the relative file name inside the temporary directory
        return f"{tmp_path}/{fp_or_buf}"
    if isinstance(fp_or_buf, os.PathLike):
        # Same as above, but keep the path-like type
        return tmp_path.joinpath(fp_or_buf)
    # Remaining params are buffer classes: build a fresh instance per test
    return fp_or_buf()


@pytest.fixture(
scope="session", params=[opt for opt in plc.types.Interpolation]
)
Expand Down
137 changes: 106 additions & 31 deletions python/cudf/cudf/pylibcudf_tests/test_json.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import io
import os
import pathlib

import pandas as pd
import pyarrow as pa
import pytest
from utils import sink_to_str

import cudf._lib.pylibcudf as plc


@pytest.fixture(
    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO(), io.StringIO()],
)
def sink(request):
    """
    Yield each supported sink: a str path, a Path, a BytesIO and a StringIO.

    The buffer objects in ``params`` are created once and reused for every
    test that requests this fixture, so they are emptied again after each
    test has finished.
    """
    target = request.param
    yield target

    # File paths need no cleanup here
    if not isinstance(target, io.IOBase):
        return

    # Reset the shared buffer so the next test starts from an empty sink
    target.seek(0)
    target.truncate(0)


@pytest.mark.parametrize("rows_per_chunk", [8, 100])
@pytest.mark.parametrize("lines", [True, False])
def test_write_json_basic(table_data, sink, tmp_path, lines):
def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
plc_table_w_meta, pa_table = table_data
if isinstance(sink, str):
sink = f"{tmp_path}/{sink}"
elif isinstance(sink, os.PathLike):
sink = tmp_path.joinpath(sink)
sink = source_or_sink

kwargs = dict()
if rows_per_chunk <= plc_table_w_meta.tbl.num_rows():
kwargs["rows_per_chunk"] = rows_per_chunk

plc.io.json.write_json(
plc.io.SinkInfo([sink]), plc_table_w_meta, lines=lines
plc.io.SinkInfo([sink]), plc_table_w_meta, lines=lines, **kwargs
)

# orient=records (basically what the cudf json writer does,
Expand All @@ -42,17 +32,102 @@ def test_write_json_basic(table_data, sink, tmp_path, lines):

# Convert everything to string to make
# comparisons easier

if isinstance(sink, (str, os.PathLike)):
with open(sink, "r") as f:
str_result = f.read()
elif isinstance(sink, io.BytesIO):
sink.seek(0)
str_result = sink.read().decode()
else:
sink.seek(0)
str_result = sink.read()
str_result = sink_to_str(sink)

pd_result = exp.to_json(orient="records", lines=lines)

assert str_result == pd_result


@pytest.mark.parametrize("include_nulls", [True, False])
@pytest.mark.parametrize("na_rep", ["null", "awef", ""])
def test_write_json_nulls(na_rep, include_nulls):
    """
    Check the ``na_rep`` and ``include_nulls`` options of ``write_json``.

    A small two-column table with one null per column is written to an
    in-memory sink. With ``include_nulls=True`` the output is compared to
    pandas' ``to_json(orient="records")`` after substituting ``na_rep``
    for ``null``. With ``include_nulls=False`` there is no pandas
    equivalent, so the test only verifies that no column is emitted with
    ``na_rep`` as its value.
    """
    names = ["a", "b"]
    pa_tbl = pa.Table.from_arrays(
        [pa.array([1.0, 2.0, None]), pa.array([True, None, False])],
        names=names,
    )
    plc_tbl = plc.interop.from_arrow(pa_tbl)
    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
        plc_tbl, column_names=[(name, []) for name in names]
    )

    sink = io.StringIO()

    plc.io.json.write_json(
        plc.io.SinkInfo([sink]),
        plc_tbl_w_meta,
        na_rep=na_rep,
        include_nulls=include_nulls,
    )

    # Convert everything to string to make
    # comparisons easier
    str_result = sink_to_str(sink)

    if not include_nulls:
        # No equivalent in pandas, so we just
        # sanity check by making sure no key is
        # written with na_rep as its value.
        # A value is always followed by "," or "}", which also keeps the
        # check meaningful when na_rep is the empty string.
        # (na_rep is written unquoted)
        for name in names:
            for terminator in (",", "}"):
                assert f'"{name}":{na_rep}{terminator}' not in str_result
        return

    # orient=records is basically what the cudf json writer does
    pd_result = pa_tbl.to_pandas().to_json(orient="records")

    # pandas doesn't support na_rep
    # let's just manually do str.replace
    pd_result = pd_result.replace("null", na_rep)

    assert str_result == pd_result


@pytest.mark.parametrize("true_value", ["True", "correct"])
@pytest.mark.parametrize("false_value", ["False", "wrong"])
def test_write_json_bool_opts(true_value, false_value):
    """
    Check the ``true_value`` and ``false_value`` options of ``write_json``.

    A single boolean column containing True, null and False is written to
    an in-memory sink, and the output is compared to pandas'
    ``to_json(orient="records")`` after substituting the custom boolean
    spellings for ``true`` and ``false``.
    """
    names = ["a"]
    pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names)
    plc_tbl = plc.interop.from_arrow(pa_tbl)
    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
        plc_tbl, column_names=[(name, []) for name in names]
    )

    sink = io.StringIO()

    plc.io.json.write_json(
        plc.io.SinkInfo([sink]),
        plc_tbl_w_meta,
        include_nulls=True,
        na_rep="null",
        true_value=true_value,
        false_value=false_value,
    )

    # Convert everything to string to make
    # comparisons easier
    str_result = sink_to_str(sink)

    # orient=records is basically what the cudf json writer does
    pd_result = pa_tbl.to_pandas().to_json(orient="records")

    # pandas has no true_value/false_value options,
    # so let's just manually do str.replace
    pd_result = pd_result.replace("true", true_value)
    pd_result = pd_result.replace("false", false_value)

    assert str_result == pd_result

0 comments on commit c54316e

Please sign in to comment.