Skip to content

Commit

Permalink
change default dtype_backend for to_pandas (#4815)
Browse files Browse the repository at this point in the history
* change default dtype_backend for to_pandas

* Improve docstrings and add comments in test code

* Accept suggested changes to docstrings
  • Loading branch information
jmao-denver authored Nov 14, 2023
1 parent 78f0d11 commit 02c9deb
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 17 deletions.
13 changes: 8 additions & 5 deletions py/server/deephaven/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri
}


def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = None,
def to_pandas(table: Table, cols: List[str] = None,
dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = "numpy_nullable",
conv_null: bool = True) -> pd.DataFrame:
"""Produces a pandas DataFrame from a table.
Expand All @@ -123,11 +124,13 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None,
Args:
table (Table): the source table
cols (List[str]): the source column names, default is None which means include all columns
dtype_backend (str): Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
        dtype_backend (str): which dtype_backend to use, i.e. which array backend a DataFrame should have;
            nullable dtypes are used for all dtypes that have a nullable implementation when "numpy_nullable" is set,
pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed DataFrames with
no nullable dtypes.
conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
            pyarrow is used for all dtypes if "pyarrow" is set. None means NumPy-backed DataFrames with no nullable
            dtypes. Both "numpy_nullable" and "pyarrow" automatically convert Deephaven nulls to pandas NA and enable
            pandas extension types. Extension types are needed to support types beyond NumPy's type system. Extension
            types support operations such as properly mapping Java Strings to Python strings. default is "numpy_nullable".
conv_null (bool): when dtype_backend is not set, whether to check for Deephaven nulls in the data and
automatically replace them with pd.NA. default is True.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_learn_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def base_test(self, source, model, np_dtype):
gatherer_colmajor = lambda rowset, colset: gather.table_to_numpy_2d(rowset, colset,
gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

array_from_table = to_pandas(source, conv_null=False).values
array_from_table = to_pandas(source, dtype_backend=None, conv_null=False).values

gathered_rowmajor = gatherer_rowmajor(rows, cols)
gathered_colmajor = gatherer_colmajor(rows, cols)
Expand Down
16 changes: 8 additions & 8 deletions py/server/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def tearDown(self) -> None:
super().tearDown()

def test_to_pandas_no_conv_null(self):
df = to_pandas(self.test_table, conv_null=False)
df = to_pandas(self.test_table, dtype_backend=None, conv_null=False)
self.assertEqual(len(df.columns), len(self.test_table.columns))
self.assertEqual(df.size, 2 * len(self.test_table.columns))
df_series = [df[col] for col in list(df.columns)]
Expand All @@ -70,7 +70,7 @@ def test_to_pandas_remaps(self):
prepared_table = self.test_table.update(
formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])

df = to_pandas(prepared_table, cols=["Boolean", "Long"], conv_null=False)
df = to_pandas(prepared_table, cols=["Boolean", "Long"], dtype_backend=None, conv_null=False)
self.assertEqual(df['Long'].dtype, np.float64)
self.assertEqual(df['Boolean'].values.dtype, np.bool_)

Expand All @@ -88,12 +88,12 @@ def test_vector_column(self):

test_table = test_table.group_by(["String"])
df = to_pandas(test_table, cols=["String", "Doubles"])
self.assertEqual(df['String'].dtype, np.object_)
self.assertEqual(df['String'].dtype, pd.StringDtype())
self.assertEqual(df['Doubles'].dtype, np.object_)

double_series = df['Doubles']
self.assertEqual([1.0, 2.0], list(double_series[0].toArray()))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1].toArray()))
self.assertEqual([1.0, 2.0], list(double_series[0]))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1]))

def test_invalid_col_name(self):
with self.assertRaises(DHError) as cm:
Expand All @@ -114,7 +114,7 @@ def test_to_table(self):
double_col(name="Double", data=[1.01, -1.01]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table, conv_null=False)
df = to_pandas(test_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, test_table)

Expand All @@ -123,7 +123,7 @@ def test_to_table_boolean_with_none(self):
table_with_null_bool = new_table(cols=input_cols)
prepared_table = table_with_null_bool.update(
formulas=["Boolean = isNull(Boolean) ? (byte)NULL_BYTE : (Boolean == true ? 1: 0)"])
df = to_pandas(prepared_table, conv_null=False)
df = to_pandas(prepared_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, prepared_table)

Expand Down Expand Up @@ -159,7 +159,7 @@ def test_round_trip_with_nulls(self):
pyobj_col(name="PyObj", data=[CustomClass(1, "1"), None]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table)
df = to_pandas(test_table, dtype_backend=None)
self.assertEqual(len(df.columns), len(test_table.columns))
self.assertEqual(df.size, 2 * len(test_table.columns))
test_table2 = to_table(df)
Expand Down
7 changes: 5 additions & 2 deletions py/server/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,8 @@ def test_dates_and_time(self):
from_disk = read('data_from_dh.parquet')
self.assert_table_equals(dh_table, from_disk)

df_from_disk = to_pandas(from_disk)
# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
if pandas.__version__.split('.')[0] == "1":
df_from_pandas = pandas.read_parquet("data_from_dh.parquet", use_nullable_dtypes=True)
else:
Expand Down Expand Up @@ -384,7 +385,9 @@ def time_test_helper(pa_table, new_schema, dest):
# Write the provided pyarrow table type-casted to the new schema
pyarrow.parquet.write_table(pa_table.cast(new_schema), dest)
from_disk = read(dest)
df_from_disk = to_pandas(from_disk)

# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
original_df = pa_table.to_pandas()
# Compare the dataframes as strings
self.assertTrue((df_from_disk.astype(str) == original_df.astype(str)).all().values.all())
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def verify_layout_hint(t: Table, layout_hint_str: str):
self.assertIn("RuntimeError", cm.exception.compact_traceback)

def verify_table_data(self, t: Table, expected: List[Any], assert_not_in: bool = False):
t_data = to_pandas(t).values.flatten()
t_data = to_pandas(t, dtype_backend=None).values.flatten()
for s in expected:
if assert_not_in:
self.assertNotIn(s, t_data)
Expand Down

0 comments on commit 02c9deb

Please sign in to comment.