Skip to content

Commit

Permalink
change default dtype_backend for to_pandas (#4815)
Browse files Browse the repository at this point in the history
* change default dtype_backend for to_pandas

* Improve docstrings and add comments in test code

* Accept suggested changes to docstrings
  • Loading branch information
jmao-denver authored Nov 14, 2023
1 parent 78f0d11 commit 02c9deb
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 17 deletions.
13 changes: 8 additions & 5 deletions py/server/deephaven/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri
}


def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = None,
def to_pandas(table: Table, cols: List[str] = None,
dtype_backend: Literal[None, "pyarrow", "numpy_nullable"] = "numpy_nullable",
conv_null: bool = True) -> pd.DataFrame:
"""Produces a pandas DataFrame from a table.
Expand All @@ -123,11 +124,13 @@ def to_pandas(table: Table, cols: List[str] = None, dtype_backend: Literal[None,
Args:
table (Table): the source table
cols (List[str]): the source column names, default is None which means include all columns
dtype_backend (str): Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays,
        dtype_backend (str): which dtype_backend to use, i.e. which array backend a DataFrame should have;
            nullable dtypes are used for all dtypes that have a nullable implementation when "numpy_nullable" is set,
pyarrow is used for all dtypes if “pyarrow” is set. default is None, meaning Numpy backed DataFrames with
no nullable dtypes.
conv_null (bool): When dtype_backend is not set, whether to check for Deephaven nulls in the data and
            pyarrow is used for all dtypes if "pyarrow" is set. None means NumPy-backed DataFrames with no nullable
            dtypes. Both "numpy_nullable" and "pyarrow" automatically convert Deephaven nulls to pandas NA and enable
            pandas extension types. Extension types are needed to support types beyond NumPy's type system. Extension
            types support operations such as properly mapping Java Strings to Python strings. default is "numpy_nullable".
conv_null (bool): when dtype_backend is not set, whether to check for Deephaven nulls in the data and
automatically replace them with pd.NA. default is True.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_learn_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def base_test(self, source, model, np_dtype):
gatherer_colmajor = lambda rowset, colset: gather.table_to_numpy_2d(rowset, colset,
gather.MemoryLayout.COLUMN_MAJOR, np_dtype)

array_from_table = to_pandas(source, conv_null=False).values
array_from_table = to_pandas(source, dtype_backend=None, conv_null=False).values

gathered_rowmajor = gatherer_rowmajor(rows, cols)
gathered_colmajor = gatherer_colmajor(rows, cols)
Expand Down
16 changes: 8 additions & 8 deletions py/server/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def tearDown(self) -> None:
super().tearDown()

def test_to_pandas_no_conv_null(self):
df = to_pandas(self.test_table, conv_null=False)
df = to_pandas(self.test_table, dtype_backend=None, conv_null=False)
self.assertEqual(len(df.columns), len(self.test_table.columns))
self.assertEqual(df.size, 2 * len(self.test_table.columns))
df_series = [df[col] for col in list(df.columns)]
Expand All @@ -70,7 +70,7 @@ def test_to_pandas_remaps(self):
prepared_table = self.test_table.update(
formulas=["Long = isNull(Long_) ? Double.NaN : Long_"])

df = to_pandas(prepared_table, cols=["Boolean", "Long"], conv_null=False)
df = to_pandas(prepared_table, cols=["Boolean", "Long"], dtype_backend=None, conv_null=False)
self.assertEqual(df['Long'].dtype, np.float64)
self.assertEqual(df['Boolean'].values.dtype, np.bool_)

Expand All @@ -88,12 +88,12 @@ def test_vector_column(self):

test_table = test_table.group_by(["String"])
df = to_pandas(test_table, cols=["String", "Doubles"])
self.assertEqual(df['String'].dtype, np.object_)
self.assertEqual(df['String'].dtype, pd.StringDtype())
self.assertEqual(df['Doubles'].dtype, np.object_)

double_series = df['Doubles']
self.assertEqual([1.0, 2.0], list(double_series[0].toArray()))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1].toArray()))
self.assertEqual([1.0, 2.0], list(double_series[0]))
self.assertEqual([4.0, 8.0, 16.0], list(double_series[1]))

def test_invalid_col_name(self):
with self.assertRaises(DHError) as cm:
Expand All @@ -114,7 +114,7 @@ def test_to_table(self):
double_col(name="Double", data=[1.01, -1.01]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table, conv_null=False)
df = to_pandas(test_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, test_table)

Expand All @@ -123,7 +123,7 @@ def test_to_table_boolean_with_none(self):
table_with_null_bool = new_table(cols=input_cols)
prepared_table = table_with_null_bool.update(
formulas=["Boolean = isNull(Boolean) ? (byte)NULL_BYTE : (Boolean == true ? 1: 0)"])
df = to_pandas(prepared_table, conv_null=False)
df = to_pandas(prepared_table, dtype_backend=None, conv_null=False)
table_from_df = to_table(df)
self.assert_table_equals(table_from_df, prepared_table)

Expand Down Expand Up @@ -159,7 +159,7 @@ def test_round_trip_with_nulls(self):
pyobj_col(name="PyObj", data=[CustomClass(1, "1"), None]),
]
test_table = new_table(cols=input_cols)
df = to_pandas(test_table)
df = to_pandas(test_table, dtype_backend=None)
self.assertEqual(len(df.columns), len(test_table.columns))
self.assertEqual(df.size, 2 * len(test_table.columns))
test_table2 = to_table(df)
Expand Down
7 changes: 5 additions & 2 deletions py/server/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,8 @@ def test_dates_and_time(self):
from_disk = read('data_from_dh.parquet')
self.assert_table_equals(dh_table, from_disk)

df_from_disk = to_pandas(from_disk)
# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
if pandas.__version__.split('.')[0] == "1":
df_from_pandas = pandas.read_parquet("data_from_dh.parquet", use_nullable_dtypes=True)
else:
Expand Down Expand Up @@ -384,7 +385,9 @@ def time_test_helper(pa_table, new_schema, dest):
# Write the provided pyarrow table type-casted to the new schema
pyarrow.parquet.write_table(pa_table.cast(new_schema), dest)
from_disk = read(dest)
df_from_disk = to_pandas(from_disk)

# TODO dtype_backend=None is a workaround until https://github.com/deephaven/deephaven-core/issues/4823 is fixed
df_from_disk = to_pandas(from_disk, dtype_backend=None)
original_df = pa_table.to_pandas()
# Compare the dataframes as strings
self.assertTrue((df_from_disk.astype(str) == original_df.astype(str)).all().values.all())
Expand Down
2 changes: 1 addition & 1 deletion py/server/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@ def verify_layout_hint(t: Table, layout_hint_str: str):
self.assertIn("RuntimeError", cm.exception.compact_traceback)

def verify_table_data(self, t: Table, expected: List[Any], assert_not_in: bool = False):
t_data = to_pandas(t).values.flatten()
t_data = to_pandas(t, dtype_backend=None).values.flatten()
for s in expected:
if assert_not_in:
self.assertNotIn(s, t_data)
Expand Down

0 comments on commit 02c9deb

Please sign in to comment.