ARROW-7569: [Python] Add API to map Arrow types to pandas ExtensionDtypes in to_pandas conversions #6189

Closed
14 changes: 12 additions & 2 deletions python/pyarrow/array.pxi
```diff
@@ -491,7 +491,8 @@ cdef class _PandasConvertible:
             bint deduplicate_objects=True,
             bint ignore_metadata=False,
             bint split_blocks=False,
-            bint self_destruct=False
+            bint self_destruct=False,
+            types_mapper=None
     ):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -531,6 +532,14 @@ cdef class _PandasConvertible:
             memory while converting the Arrow object to pandas. If you use the
             object after calling to_pandas with this option it will crash your
             program
+        types_mapper : function, default None
+            A function mapping a pyarrow DataType to a pandas ExtensionDtype.
+            This can be used to override the default pandas type for conversion
+            of built-in pyarrow types or in absence of pandas_metadata in the
+            Table schema. The function receives a pyarrow DataType and is
+            expected to return a pandas ExtensionDtype or ``None`` if the
+            default conversion should be used for that type. If you have
+            a dictionary mapping, you can pass ``dict.get`` as function.
 
         Returns
         -------
@@ -548,7 +557,8 @@ cdef class _PandasConvertible:
             self_destruct=self_destruct
         )
         return self._to_pandas(options, categories=categories,
-                               ignore_metadata=ignore_metadata)
+                               ignore_metadata=ignore_metadata,
+                               types_mapper=types_mapper)
 
 
 cdef PandasOptions _convert_pandas_options(dict options):
```
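
Usage sketch (editor's illustration, mirroring the test added in test_pandas.py below): the mapping dict is keyed by exact DataType instances, and ``dict.get`` returns ``None`` for any other type, which keeps the default conversion.

```python
import pyarrow as pa
import pandas as pd

table = pa.table({'a': pa.array([1, 2, 3], pa.int64())})

# dict.get returns None for types not in the mapping, so those columns
# keep the default (NumPy-based) conversion.
df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
assert isinstance(df['a'].dtype, pd.Int64Dtype)
```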
76 changes: 32 additions & 44 deletions python/pyarrow/pandas_compat.py
```diff
@@ -741,7 +741,7 @@ def make_datetimetz(tz):
 
 
 def table_to_blockmanager(options, table, categories=None,
-                          extension_columns=None, ignore_metadata=False):
+                          ignore_metadata=False, types_mapper=None):
     from pandas.core.internals import BlockManager
 
     all_columns = []
@@ -756,14 +756,10 @@ def table_to_blockmanager(options, table, categories=None,
         table, index = _reconstruct_index(table, index_descriptors,
                                           all_columns)
         ext_columns_dtypes = _get_extension_dtypes(
-            table, all_columns, extension_columns)
+            table, all_columns, types_mapper)
     else:
         index = _pandas_api.pd.RangeIndex(table.num_rows)
-        if extension_columns:
-            raise ValueError("extension_columns not supported if there is "
-                             "no pandas_metadata")
-        ext_columns_dtypes = _get_extension_dtypes(
-            table, [], extension_columns)
+        ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
 
     _check_data_column_metadata_consistency(all_columns)
     columns = _deserialize_column_index(table, all_columns, column_indexes)
@@ -782,7 +778,7 @@
     ])
 
 
-def _get_extension_dtypes(table, columns_metadata, extension_columns):
+def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
     """
     Based on the stored column pandas metadata and the extension types
     in the arrow schema, infer which columns should be converted to a
@@ -799,46 +795,38 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
 
     # older pandas version that does not yet support extension dtypes
     if _pandas_api.extension_dtype is None:
-        if extension_columns is not None:
-            raise ValueError(
-                "Converting to pandas ExtensionDtypes is not supported")
         return ext_columns
 
-    if extension_columns is None:
-        # infer the extension columns from the pandas metadata
-        for col_meta in columns_metadata:
-            name = col_meta['name']
-            dtype = col_meta['numpy_type']
-            if dtype not in _pandas_supported_numpy_types:
-                # pandas_dtype is expensive, so avoid doing this for types
-                # that are certainly numpy dtypes
-                pandas_dtype = _pandas_api.pandas_dtype(dtype)
-                if isinstance(pandas_dtype, _pandas_api.extension_dtype):
-                    if hasattr(pandas_dtype, "__from_arrow__"):
-                        ext_columns[name] = pandas_dtype
-        # infer from extension type in the schema
-        for field in table.schema:
-            typ = field.type
-            if isinstance(typ, pa.BaseExtensionType):
-                try:
-                    pandas_dtype = typ.to_pandas_dtype()
-                except NotImplementedError:
-                    pass
-                else:
-                    ext_columns[field.name] = pandas_dtype
-    else:
-        # get the extension dtype for the specified columns
-        for name in extension_columns:
-            col_meta = [
-                meta for meta in columns_metadata if meta['name'] == name][0]
-            pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])
-            if not isinstance(pandas_dtype, _pandas_api.extension_dtype):
-                raise ValueError("not an extension dtype")
-            if not hasattr(pandas_dtype, "__from_arrow__"):
-                raise ValueError("this column does not support to be "
-                                 "converted to extension dtype")
-            ext_columns[name] = pandas_dtype
+    # infer the extension columns from the pandas metadata
+    for col_meta in columns_metadata:
+        name = col_meta['name']
+        dtype = col_meta['numpy_type']
+        if dtype not in _pandas_supported_numpy_types:
+            # pandas_dtype is expensive, so avoid doing this for types
+            # that are certainly numpy dtypes
+            pandas_dtype = _pandas_api.pandas_dtype(dtype)
+            if isinstance(pandas_dtype, _pandas_api.extension_dtype):
+                if hasattr(pandas_dtype, "__from_arrow__"):
+                    ext_columns[name] = pandas_dtype
+
+    # infer from extension type in the schema
+    for field in table.schema:
+        typ = field.type
+        if isinstance(typ, pa.BaseExtensionType):
+            try:
+                pandas_dtype = typ.to_pandas_dtype()
+            except NotImplementedError:
+                pass
+            else:
+                ext_columns[field.name] = pandas_dtype
+
+    # use the specified mapping of built-in arrow types to pandas dtypes
+    if types_mapper:
+        for field in table.schema:
+            typ = field.type
+            pandas_dtype = types_mapper(typ)
+            if pandas_dtype is not None:
+                ext_columns[field.name] = pandas_dtype
 
     return ext_columns
```

@jorisvandenbossche (Member, Author) commented on Jan 23, 2020, on the removed else block:

I removed this "else" branch for when extension_columns was specified, as it is no longer used (I added it initially to be able to specify which columns to convert, for testing, when inference from the metadata was not yet implemented).

(This makes the diff a bit harder to read, but in this whole _get_extension_dtypes basically only the `if types_mapper:` block is added; the rest is only dedented.)
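
Editor's note: the metadata- and schema-based inference above keys off the `__from_arrow__` hook that pandas ExtensionDtypes can implement. A minimal sketch of that hook in action, assuming a pandas version with nullable dtypes (>= 0.26.0.dev, per the version check in the test below):

```python
import pyarrow as pa
import pandas as pd

# pd.Int64Dtype implements __from_arrow__: given a pyarrow Array or
# ChunkedArray, it returns the matching pandas ExtensionArray.
dtype = pd.Int64Dtype()
chunked = pa.chunked_array([[1, 2, None]])
ext_array = dtype.__from_arrow__(chunked)
print(ext_array)  # pandas IntegerArray: [1, 2, <NA>]
```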
6 changes: 4 additions & 2 deletions python/pyarrow/table.pxi
```diff
@@ -1361,11 +1361,13 @@ cdef class Table(_PandasConvertible):
 
         return result
 
-    def _to_pandas(self, options, categories=None, ignore_metadata=False):
+    def _to_pandas(self, options, categories=None, ignore_metadata=False,
+                   types_mapper=None):
         from pyarrow.pandas_compat import table_to_blockmanager
         mgr = table_to_blockmanager(
             options, self, categories,
-            ignore_metadata=ignore_metadata)
+            ignore_metadata=ignore_metadata,
+            types_mapper=types_mapper)
         return pandas_api.data_frame(mgr)
 
     def to_pydict(self):
```
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_pandas.py
```diff
@@ -3564,6 +3564,34 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch):
         table.to_pandas()
 
 
+def test_to_pandas_extension_dtypes_mapping():
+    if LooseVersion(pd.__version__) < "0.26.0.dev":
+        pytest.skip("Conversion to pandas IntegerArray not yet supported")
+
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int64())})
+
+    # default use numpy dtype
+    result = table.to_pandas()
+    assert result['a'].dtype == np.dtype('int64')
+
+    # specify to override the default
+    result = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
+    assert isinstance(result['a'].dtype, pd.Int64Dtype)
+
+    # types that return None in function get normal conversion
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int32())})
+    result = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
+    assert result['a'].dtype == np.dtype('int32')
+
+    # `types_mapper` overrules the pandas metadata
+    table = pa.table(pd.DataFrame({'a': pd.array([1, 2, 3], dtype="Int64")}))
+    result = table.to_pandas()
+    assert isinstance(result['a'].dtype, pd.Int64Dtype)
+    result = table.to_pandas(
+        types_mapper={pa.int64(): pd.PeriodDtype('D')}.get)
+    assert isinstance(result['a'].dtype, pd.PeriodDtype)
+
+
 # ----------------------------------------------------------------------
 # Legacy metadata compatibility tests
```
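
Editor's note, not part of the PR: a dict-based mapper only matches exact DataType instances (e.g. `pa.int64()`), so for whole families of types, or for parametrized types such as timestamps, a small function is the more natural `types_mapper`; returning `None` keeps the default conversion. A minimal sketch, assuming pandas nullable dtypes are available:

```python
import pyarrow as pa
import pandas as pd

def nullable_mapper(arrow_type):
    # For simplicity, map any Arrow integer type to pandas' nullable
    # Int64; every other type returns None and keeps the default.
    if pa.types.is_integer(arrow_type):
        return pd.Int64Dtype()
    return None

table = pa.table({'a': pa.array([1, None, 3], pa.int64()),
                  'b': pa.array(['x', 'y', 'z'])})
df = table.to_pandas(types_mapper=nullable_mapper)
assert isinstance(df['a'].dtype, pd.Int64Dtype)
assert df['b'].dtype == object  # strings keep the default conversion
```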