clean-up, remove extension_column kwarg in to_pandas, add docs
jorisvandenbossche committed Oct 30, 2019
1 parent 6f6b6f6 commit 013d904
Showing 5 changed files with 99 additions and 57 deletions.
36 changes: 36 additions & 0 deletions docs/source/python/extending_types.rst
@@ -224,3 +224,39 @@ data type from above would look like::
return PeriodType, (self.freq,)

Also the storage type does not need to be fixed but can be parametrized.

Conversion to pandas
~~~~~~~~~~~~~~~~~~~~

The conversion to pandas (in :meth:`Table.to_pandas`) of columns with an
extension type can be controlled in case there is a corresponding
`pandas extension array <https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extension-types>`__
for your extension type.

For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be
implemented, and should return a ``pandas.api.extensions.ExtensionDtype``
subclass instance.

Using the pandas period type from above as an example, this would look like::

    class PeriodType(pa.ExtensionType):
        ...

        def to_pandas_dtype(self):
            import pandas as pd
            return pd.PeriodDtype(freq=self.freq)

Secondly, the pandas ``ExtensionDtype`` in turn needs to have the
``__from_arrow__`` method implemented: a method that, given a pyarrow Array
or ChunkedArray of the extension type, can construct the corresponding
pandas ``ExtensionArray``. This method should have the following signature::

    class MyExtensionDtype(pd.api.extensions.ExtensionDtype):
        ...

        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray:
            ...

This way, you can control the conversion of a pyarrow ``Array`` of your
extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame.
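
As an illustration only (this sketch is not part of the committed docs: it
assumes the storage array holds period ordinals that
``pandas.arrays.PeriodArray`` can consume directly, and it ignores null
handling for brevity), a ``__from_arrow__`` for the period example could
look like::

    import pandas as pd
    import pyarrow as pa

    class PeriodDtype(pd.api.extensions.ExtensionDtype):
        ...

        def __from_arrow__(self, array):
            # accept both a pyarrow Array and a ChunkedArray
            chunks = (array.chunks if isinstance(array, pa.ChunkedArray)
                      else [array])
            results = []
            for chunk in chunks:
                # the values of an extension array live in its storage
                # array (int64 period ordinals in this example)
                ordinals = chunk.storage.to_numpy()
                results.append(
                    pd.arrays.PeriodArray(ordinals, freq=self.freq))
            return pd.arrays.PeriodArray._concat_same_type(results)
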
4 changes: 1 addition & 3 deletions python/pyarrow/array.pxi
@@ -421,8 +421,7 @@ cdef class _PandasConvertible:
bint date_as_object=True,
bint use_threads=True,
bint deduplicate_objects=True,
bint ignore_metadata=False,
extension_columns=None,
bint ignore_metadata=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -467,7 +466,6 @@
deduplicate_objects=deduplicate_objects
)
return self._to_pandas(options, categories=categories,
extension_columns=extension_columns,
ignore_metadata=ignore_metadata)


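For reference, a minimal usage sketch of the simplified API (a hypothetical
session, assuming a pandas version whose ``Int64Dtype`` implements
``__from_arrow__``): which columns come back as pandas extension arrays is
now inferred from the pandas metadata and the dtype protocol rather than
passed explicitly::

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
    table = pa.table(df)

    # before this commit one could write
    # table.to_pandas(extension_columns=['a']);
    # now the nullable column round-trips automatically
    result = table.to_pandas()
    print(result.dtypes)  # a    Int64
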
39 changes: 32 additions & 7 deletions python/pyarrow/pandas_compat.py
@@ -61,7 +61,6 @@ def get_logical_type_map():
pa.lib.Type_BINARY: 'bytes',
pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
pa.lib.Type_STRING: 'unicode',
# pa.lib.Type_EXTENSION: 'extension',
})
return _logical_type_map

@@ -647,10 +646,34 @@ def serialized_dict_to_dataframe(data):


def _reconstruct_block(item, columns=None, extension_columns=None):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
This function takes care of converting dictionary types to pandas
categorical, Timestamp-with-timezones to the proper pandas Block, and
conversion to pandas ExtensionBlock
Parameters
----------
item : dict
For basic types, this is a dictionary in the form of
{'block': np.ndarray of values, 'placement': pandas block placement}.
Additional keys are present for other types (dictionary, timezone,
object).
columns :
Column names of the table being constructed, used for extension types
extension_columns : dict
Dictionary of {column_name: pandas_dtype} that includes all columns
and corresponding dtypes that will be converted to a pandas
ExtensionBlock.
Returns
-------
pandas Block
"""
import pandas.core.internals as _int
# Construct the individual blocks converting dictionary types to pandas
# categorical types and Timestamps-with-timezones types to the proper
# pandas Blocks

block_arr = item.get('block', None)
placement = item['placement']
@@ -675,7 +698,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
name = columns[placement[0]]
pandas_dtype = extension_columns[name]
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support")
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
block = _int.make_block(pd_ext_arr, placement=placement,
klass=_int.ExtensionBlock)
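
For illustration, a minimal sketch (with made-up values) of the kind of
``item`` dictionary described in the docstring above, for a plain numeric
column::

    import numpy as np

    item = {
        # 2D values array: one row per column stored in this block
        'block': np.array([[1, 2, 3]]),
        # which DataFrame column slots this block occupies
        'placement': [0],
    }
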
@@ -729,8 +753,9 @@ def table_to_blockmanager(options, table, categories=None,

def _get_extension_dtypes(table, columns_metadata, extension_columns):
"""
Based on the stored column pandas metadata, infer which columns
should be converted to a pandas extension dtype.
Based on the stored column pandas metadata and the extension types
in the arrow schema, infer which columns should be converted to a
pandas extension dtype.
The 'numpy_type' field in the column metadata stores the string
representation of the original pandas dtype (and, despite its name,
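As a rough sketch of the two inference sources this docstring describes (an
approximation for illustration, not the committed implementation)::

    import pandas as pd
    import pyarrow as pa

    def get_extension_dtypes_sketch(table, columns_metadata):
        ext_columns = {}
        # 1. pandas metadata: 'numpy_type' holds the original pandas dtype
        #    string; if it resolves to a dtype implementing __from_arrow__,
        #    restore the column as an extension column
        for col_meta in columns_metadata:
            try:
                pandas_dtype = pd.api.types.pandas_dtype(
                    col_meta['numpy_type'])
            except TypeError:
                continue
            if hasattr(pandas_dtype, '__from_arrow__'):
                ext_columns[col_meta['name']] = pandas_dtype
        # 2. arrow schema: extension types that advertise a pandas
        #    counterpart through to_pandas_dtype()
        for field in table.schema:
            if isinstance(field.type, pa.ExtensionType):
                try:
                    pandas_dtype = field.type.to_pandas_dtype()
                except NotImplementedError:
                    continue
                if hasattr(pandas_dtype, '__from_arrow__'):
                    ext_columns[field.name] = pandas_dtype
        return ext_columns
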
6 changes: 2 additions & 4 deletions python/pyarrow/table.pxi
@@ -1269,12 +1269,10 @@ cdef class Table(_PandasConvertible):

return result

def _to_pandas(self, options, categories=None, extension_columns=None,
ignore_metadata=False):
def _to_pandas(self, options, categories=None, ignore_metadata=False):
from pyarrow.pandas_compat import table_to_blockmanager
mgr = table_to_blockmanager(
options, self, categories, extension_columns,
ignore_metadata=ignore_metadata)
options, self, categories, ignore_metadata=ignore_metadata)
return pandas_api.data_frame(mgr)

def to_pydict(self):
71 changes: 28 additions & 43 deletions python/pyarrow/tests/test_pandas.py
@@ -3244,7 +3244,7 @@ def _Int64Dtype__from_arrow__(self, array):
return int_arr


def test_convert_to_extension_array():
def test_convert_to_extension_array(monkeypatch):
if LooseVersion(pd.__version__) < "0.26.0.dev":
pytest.skip("Conversion from IntegerArray to arrow not yet supported")

@@ -3262,37 +3262,24 @@ def test_convert_to_extension_array():
assert len(result._data.blocks) == 1
assert isinstance(result._data.blocks[0], _int.IntBlock)

# raise error is explicitly asking for unsupported conversion
with pytest.raises(ValueError):
table.to_pandas(extension_columns=['b'])

try:
# patch pandas Int64Dtype to have the protocol method
pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__

# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)

# explicitly specifying the column works as well
# TODO is this useful?
result = table.to_pandas(extension_columns=['b'])
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)
# patch pandas Int64Dtype to have the protocol method
monkeypatch.setattr(
pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
raising=False)

# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas(extension_columns=['a'])
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
tm.assert_frame_equal(result, df2)
# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)

finally:
del pd.Int64Dtype.__from_arrow__
# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
tm.assert_frame_equal(result, df2)


class MyCustomIntegerType(pa.PyExtensionType):
@@ -3307,7 +3294,7 @@ def to_pandas_dtype(self):
return pd.Int64Dtype()


def test_conversion_extensiontype_to_extensionarray():
def test_conversion_extensiontype_to_extensionarray(monkeypatch):
# converting extension type to linked pandas ExtensionDtype/Array
import pandas.core.internals as _int

@@ -3318,19 +3305,17 @@ def test_conversion_extensiontype_to_extensionarray():
with pytest.raises(ValueError):
table.to_pandas()

try:
# patch pandas Int64Dtype to have the protocol method
pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__
# patch pandas Int64Dtype to have the protocol method
monkeypatch.setattr(
pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
raising=False)

# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)

finally:
del pd.Int64Dtype.__from_arrow__
# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)


# ----------------------------------------------------------------------
