Skip to content

Commit

Permalink
Don't write an all-null column with a dictionary (#3141)
Browse files Browse the repository at this point in the history
Fixes #3134
  • Loading branch information
niloc132 authored Dec 6, 2022
1 parent 1e77b4c commit c22c68c
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Integrations/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def runInDocker = { String name, String sourcePath, List<String> command, Closur
// set up the container, env vars - things that aren't likely to change
from 'deephaven/server-netty:local-build'
runCommand '''set -eux; \\
pip3 install unittest-xml-reporting==3.0.4;\\
pip3 install pyarrow unittest-xml-reporting==3.0.4;\\
mkdir -p /out/report'''
volume '/data'
volume '/cache'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,10 @@ private static <DATA_TYPE> boolean tryEncodeDictionary(@NotNull final ParquetIns
}
}

if (keyCount == 0 && hasNulls) {
return false;
}

columnWriter.addDictionaryPage(encodedKeys, keyCount);
final Iterator<IntBuffer> arraySizeIt = arraySizeBuffers == null ? null : arraySizeBuffers.iterator();
for (final IntBuffer pageBuffer : pageBuffers) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ private static Table getTableFlat(int size, boolean includeSerializable) {
ArrayList<String> columns =
new ArrayList<>(Arrays.asList("someStringColumn = i % 10 == 0?null:(`` + (i % 101))",
"nonNullString = `` + (i % 60)",
"nullString = (String) null",
"nonNullPolyString = `` + (i % 600)",
"someIntColumn = i",
"someLongColumn = ii",
Expand All @@ -76,7 +77,18 @@ private static Table getTableFlat(int size, boolean includeSerializable) {
"someKey = `` + (int)(i /100)",
"nullKey = i < -1?`123`:null",
"bdColumn = java.math.BigDecimal.valueOf(ii).stripTrailingZeros()",
"biColumn = java.math.BigInteger.valueOf(ii)"));
"biColumn = java.math.BigInteger.valueOf(ii)",
"nullKey = i < -1?`123`:null",
"nullIntColumn = (int)null",
"nullLongColumn = (long)null",
"nullDoubleColumn = (double)null",
"nullFloatColumn = (float)null",
"nullBoolColumn = (Boolean)null",
"nullShortColumn = (short)null",
"nullByteColumn = (byte)null",
"nullCharColumn = (char)null",
"nullTime = (DateTime)null",
"nullString = (String)null"));
if (includeSerializable) {
columns.add("someSerializable = new SomeSillyTest(i)");
}
Expand Down
68 changes: 68 additions & 0 deletions py/server/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
import unittest
import tempfile

import pandas
from deephaven.pandas import to_pandas, to_table

from deephaven import empty_table, dtypes, new_table
from deephaven.column import InputColumn
from deephaven.parquet import write, batch_write, read, delete, ColumnInstruction
from deephaven.table import Table

from tests.testbase import BaseTestCase

Expand Down Expand Up @@ -142,6 +146,70 @@ def test_big_decimal(self):
self.assertTrue(os.path.exists(file_location))
shutil.rmtree(base_dir)

def test_round_trip_data(self):
"""
Pass data between DH and pandas via pyarrow, making sure each side can read data the other side writes
"""

# create a table with columns to test different types and edge cases
dh_table = empty_table(20).update(formulas=[
"someStringColumn = i % 10 == 0?null:(`` + (i % 101))",
"nonNullString = `` + (i % 60)",
"nonNullPolyString = `` + (i % 600)",
"someIntColumn = i",
"someLongColumn = ii",
"someDoubleColumn = i*1.1",
"someFloatColumn = (float)(i*1.1)",
"someBoolColumn = i % 3 == 0?true:i%3 == 1?false:null",
"someShortColumn = (short)i",
"someByteColumn = (byte)i",
"someCharColumn = (char)i",
# TODO(deephaven-core#3151) pyarrow indicates this value is out of the allowed range
# "someTime = DateTime.now() + i",
"someKey = `` + (int)(i /100)",
"nullKey = i < -1?`123`:null",
"nullIntColumn = (int)null",
"nullLongColumn = (long)null",
"nullDoubleColumn = (double)null",
"nullFloatColumn = (float)null",
"nullBoolColumn = (Boolean)null",
"nullShortColumn = (short)null",
"nullByteColumn = (byte)null",
"nullCharColumn = (char)null",
"nullTime = (DateTime)null",
"nullString = (String)null",
# TODO(deephaven-core#3151) BigInteger/BigDecimal columns don't roundtrip cleanly
# "nullBigDecColumn = (java.math.BigDecimal)null",
# "nullBigIntColumn = (java.math.BigInteger)null"
])
# These tests are done with each of the fully-supported compression formats
self.round_trip_with_compression("UNCOMPRESSED", dh_table)
self.round_trip_with_compression("SNAPPY", dh_table)
# LZO is not fully supported in python/c++
# self.round_trip_with_compression("LZO", dh_table)
# TODO(deephaven-core#3148) This test seems to write parquet output with LZ4_RAW as the compression type, Java can't read it
# self.round_trip_with_compression("LZ4", dh_table)
self.round_trip_with_compression("GZIP", dh_table)
self.round_trip_with_compression("ZSTD", dh_table)

def round_trip_with_compression(self, compression_codec_name, dh_table):
# dh->parquet->dataframe (via pyarrow)->dh
write(dh_table, "data_from_dh.parquet", compression_codec_name=compression_codec_name)
dataframe = pandas.read_parquet('data_from_dh.parquet', use_nullable_dtypes=True)
result_table = to_table(dataframe)
self.assert_table_equals(dh_table, result_table)

# dh->parquet->dataframe (via pyarrow)->parquet->dh
dataframe.to_parquet('data_from_pandas.parquet', compression=None if compression_codec_name is 'UNCOMPRESSED' else compression_codec_name)
result_table = read('data_from_pandas.parquet')
self.assert_table_equals(dh_table, result_table)

# dh->dataframe (via pyarrow)->parquet->dh
# TODO(deephaven-core#3149) disable for now, since to_pandas results in "None" strings instead of None values
# dataframe = to_pandas(dh_table)
# dataframe.to_parquet('data_from_pandas.parquet', compression=None if compression_codec_name is 'UNCOMPRESSED' else compression_codec_name)
# result_table = read('data_from_pandas.parquet')
# self.assert_table_equals(dh_table, result_table)

if __name__ == '__main__':
unittest.main()

0 comments on commit c22c68c

Please sign in to comment.