Improve the test data for pylibcudf I/O tests #16247
python/cudf/cudf/pylibcudf_tests/test_json.py
```diff
@@ -5,45 +5,17 @@
 import pyarrow as pa
 import pytest
 from utils import (
-    COMPRESSION_TYPE_TO_PANDAS,
     assert_table_and_meta_eq,
+    make_source,
     sink_to_str,
+    write_source_str,
 )

 import cudf._lib.pylibcudf as plc
 from cudf._lib.pylibcudf.io.types import CompressionType


-def make_json_source(path_or_buf, pa_table, **kwargs):
-    """
-    Uses pandas to write a pyarrow Table to a JSON file.
-
-    The caller is responsible for making sure that no arguments
-    unsupported by pandas are passed in.
-    """
-    df = pa_table.to_pandas()
-    if "compression" in kwargs:
-        kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[
-            kwargs["compression"]
-        ]
-    df.to_json(path_or_buf, orient="records", **kwargs)
-    if isinstance(path_or_buf, io.IOBase):
-        path_or_buf.seek(0)
-    return path_or_buf
-
-
-def write_json_bytes(source, json_str):
-    """
-    Write a JSON string to the source
-    """
-    if not isinstance(source, io.IOBase):
-        with open(source, "w") as source_f:
-            source_f.write(json_str)
-    else:
-        if isinstance(source, io.BytesIO):
-            json_str = json_str.encode("utf-8")
-        source.write(json_str)
-        source.seek(0)
+# Shared kwargs to pass to make_source
+_COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}


 @pytest.mark.parametrize("rows_per_chunk", [8, 100])
```
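The `make_source` helper now imported from `utils` is not part of this diff. A rough, hypothetical sketch of what it might look like, assuming it generalizes the removed `make_json_source` to multiple formats behind a `format` keyword (the dispatch and the trimmed-down compression mapping below are illustrative, not the actual `utils` implementation):

```python
# Hypothetical sketch of utils.make_source; the real helper lives in the
# test suite's utils module and may differ.
import io

from cudf._lib.pylibcudf.io.types import CompressionType

# Minimal stand-in for utils.COMPRESSION_TYPE_TO_PANDAS: maps pylibcudf
# compression enums to the codec names pandas writers accept.
COMPRESSION_TYPE_TO_PANDAS = {
    CompressionType.NONE: None,
    CompressionType.GZIP: "gzip",
    CompressionType.BZIP2: "bz2",
}


def make_source(path_or_buf, pa_table, format, **kwargs):
    """Write a pyarrow Table to a path or buffer using pandas writers."""
    df = pa_table.to_pandas()
    if "compression" in kwargs:
        # Translate the pylibcudf enum into a pandas codec name
        kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[kwargs["compression"]]
    if format == "json":
        df.to_json(path_or_buf, **kwargs)  # kwargs may carry orient/lines
    elif format == "csv":
        df.to_csv(path_or_buf, **kwargs)
    else:
        raise ValueError(f"unsupported format: {format}")
    if isinstance(path_or_buf, io.IOBase):
        path_or_buf.seek(0)  # rewind buffers so the reader starts at offset 0
    return path_or_buf
```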
```diff
@@ -156,44 +128,22 @@ def test_write_json_bool_opts(true_value, false_value):

 @pytest.mark.parametrize("lines", [True, False])
 def test_read_json_basic(
-    table_data, source_or_sink, lines, compression_type, request
+    table_data, source_or_sink, lines, text_compression_type
 ):
-    if compression_type in {
-        # Not supported by libcudf
-        CompressionType.SNAPPY,
-        CompressionType.XZ,
-        CompressionType.ZSTD,
-        # Not supported by pandas
-        # TODO: find a way to test these
-        CompressionType.BROTLI,
-        CompressionType.LZ4,
-        CompressionType.LZO,
-        CompressionType.ZLIB,
-    }:
-        pytest.skip("unsupported compression type by pandas/libcudf")
+    compression_type = text_compression_type

     # can't compress non-binary data with pandas
     if isinstance(source_or_sink, io.StringIO):
         compression_type = CompressionType.NONE

     _, pa_table = table_data

-    source = make_json_source(
-        source_or_sink, pa_table, lines=lines, compression=compression_type
-    )
-
-    request.applymarker(
-        pytest.mark.xfail(
-            condition=(
-                len(pa_table) > 0
-                and compression_type
-                not in {CompressionType.NONE, CompressionType.AUTO}
-            ),
-            # note: wasn't able to narrow down the specific types that were failing
-            # seems to be a little non-deterministic, but always fails with
-            # cudaErrorInvalidValue invalid argument
-            reason="libcudf json reader crashes on compressed non empty table_data",
-        )
-    )
+    source = make_source(
+        source_or_sink,
+        pa_table,
+        lines=lines,
+        compression=compression_type,
+        **_COMMON_JSON_SOURCE_KWARGS,
+    )

     if isinstance(source, io.IOBase):
```

Review comment on the removed `xfail` marker:

> Now that the data is more well-formed, I guess this isn't crashing anymore. I think there still might be an issue in libcudf's JSON reader, though.
```diff
@@ -237,10 +187,11 @@ def test_read_json_dtypes(table_data, source_or_sink):
     # Simple test for dtypes where we read in
     # all numeric data as floats
     _, pa_table = table_data
-    source = make_json_source(
+    source = make_source(
         source_or_sink,
         pa_table,
         lines=True,
+        **_COMMON_JSON_SOURCE_KWARGS,
     )

     dtypes = []
```
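Since `_COMMON_JSON_SOURCE_KWARGS` is just `{"format": "json", "orient": "records"}`, the `make_source` call in this hunk is equivalent to:

```python
# Equivalent call with the shared kwargs inlined
source = make_source(
    source_or_sink,
    pa_table,
    lines=True,
    format="json",
    orient="records",
)
```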
```diff
@@ -295,7 +246,7 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size):
         pytest.skip("byte_range doesn't work on StringIO")

     json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n"
-    write_json_bytes(source, json_str)
+    write_source_str(source, json_str)

     tbls_w_meta = []
     for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size):
```
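`write_source_str` is another `utils` helper outside this diff; presumably it generalizes the removed `write_json_bytes` to arbitrary text content rather than just JSON. A minimal sketch under that assumption:

```python
# Hypothetical sketch of utils.write_source_str, assumed to mirror the
# removed write_json_bytes but without the JSON-specific naming.
import io


def write_source_str(source, content_str):
    """Write a string to a file path or an in-memory buffer."""
    if not isinstance(source, io.IOBase):
        # Treat source as a filesystem path
        with open(source, "w") as f:
            f.write(content_str)
    else:
        if isinstance(source, io.BytesIO):
            content_str = content_str.encode("utf-8")  # BytesIO needs bytes
        source.write(content_str)
        source.seek(0)  # rewind so the subsequent read starts at the top
```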
```diff
@@ -331,7 +282,7 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
     source = source_or_sink

     json_bytes = '["a", "b", "c"]\n'
-    write_json_bytes(source, json_bytes)
+    write_source_str(source, json_bytes)

     tbl_w_meta = plc.io.json.read_json(
         plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes
```
```diff
@@ -359,8 +310,8 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
 def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink):
     source = source_or_sink

-    json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
-    write_json_bytes(source, json_bytes)
+    json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
+    write_source_str(source, json_str)

     if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL:
         with pytest.raises(RuntimeError):
```
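The visible context only shows the `FAIL` path, where the malformed `abc` line makes the read raise. As a usage sketch of the recovering path, assuming the `recovery_mode` keyword and the `RECOVER_WITH_NULL` enum value from this test's parametrization (neither is confirmed by the diff itself):

```python
# Sketch: with RECOVER_WITH_NULL, the malformed "abc" line should become a
# null row instead of raising. Call shape is an assumption based on the test.
import io

import cudf._lib.pylibcudf as plc

source = io.StringIO('{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n')
tbl_w_meta = plc.io.json.read_json(
    plc.io.SourceInfo([source]),
    lines=True,
    recovery_mode=plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL,
)
# Expect four rows, with the third (the "abc" line) read as all nulls.
```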
Review comment on the new `text_compression_type` fixture:

> Added this since most formats don't support all of the compression types, so it probably makes sense to break it out into text (CSV/JSON) vs. binary (ORC/Parquet).
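The fixture itself lives in the suite's conftest and isn't shown in this diff; a hypothetical sketch of the text/binary split the comment describes (the exact parameter lists are assumptions):

```python
# Hypothetical conftest sketch: compression types parametrized separately for
# text formats (CSV/JSON) vs. binary formats (ORC/Parquet). The actual
# fixture definitions are not part of this diff and may differ.
import pytest

from cudf._lib.pylibcudf.io.types import CompressionType


@pytest.fixture(
    params=[
        CompressionType.NONE,
        CompressionType.AUTO,
        CompressionType.GZIP,
        CompressionType.BZIP2,
    ]
)
def text_compression_type(request):
    """Compression types both pandas and libcudf handle for text sources."""
    return request.param


@pytest.fixture(
    params=[
        CompressionType.NONE,
        CompressionType.AUTO,
        CompressionType.SNAPPY,
        CompressionType.ZSTD,
    ]
)
def binary_compression_type(request):
    """Compression types typical for binary formats like ORC/Parquet."""
    return request.param
```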