ARROW-7569: [Python] Add API to map Arrow types to pandas ExtensionDtypes in to_pandas conversions #6189

Closed
14 changes: 12 additions & 2 deletions python/pyarrow/array.pxi
```diff
@@ -491,7 +491,8 @@ cdef class _PandasConvertible:
             bint deduplicate_objects=True,
             bint ignore_metadata=False,
             bint split_blocks=False,
-            bint self_destruct=False
+            bint self_destruct=False,
+            types_mapper=None
     ):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -531,6 +532,14 @@ cdef class _PandasConvertible:
             memory while converting the Arrow object to pandas. If you use the
             object after calling to_pandas with this option it will crash your
             program
+        types_mapper : function, default None
+            A function mapping a pyarrow DataType to a pandas ExtensionDtype.
+            This can be used to override the default pandas type for conversion
+            of built-in pyarrow types or in absence of pandas_metadata in the
+            Table schema. The function receives a pyarrow DataType and is
+            expected to return a pandas ExtensionDtype or ``None`` if the
+            default conversion should be used for that type. If you have
+            a dictionary mapping, you can pass ``dict.get`` as function.
 
         Returns
         -------
@@ -548,7 +557,8 @@ cdef class _PandasConvertible:
             self_destruct=self_destruct
         )
         return self._to_pandas(options, categories=categories,
-                               ignore_metadata=ignore_metadata)
+                               ignore_metadata=ignore_metadata,
+                               types_mapper=types_mapper)
 
 
 cdef PandasOptions _convert_pandas_options(dict options):
```
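
Usage sketch (editor's illustration, mirroring the test added in test_pandas.py below): the mapping dict is keyed by exact DataType instances, and ``dict.get`` returns ``None`` for any other type, which keeps the default conversion.

```python
import pyarrow as pa
import pandas as pd

table = pa.table({'a': pa.array([1, 2, 3], pa.int64())})

# dict.get returns None for types not in the mapping, so those columns
# keep the default (NumPy-based) conversion.
df = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
assert isinstance(df['a'].dtype, pd.Int64Dtype)
```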
76 changes: 32 additions & 44 deletions python/pyarrow/pandas_compat.py
```diff
@@ -741,7 +741,7 @@ def make_datetimetz(tz):
 
 
 def table_to_blockmanager(options, table, categories=None,
-                          extension_columns=None, ignore_metadata=False):
+                          ignore_metadata=False, types_mapper=None):
     from pandas.core.internals import BlockManager
 
     all_columns = []
@@ -756,14 +756,10 @@ def table_to_blockmanager(options, table, categories=None,
         table, index = _reconstruct_index(table, index_descriptors,
                                           all_columns)
         ext_columns_dtypes = _get_extension_dtypes(
-            table, all_columns, extension_columns)
+            table, all_columns, types_mapper)
     else:
         index = _pandas_api.pd.RangeIndex(table.num_rows)
-        if extension_columns:
-            raise ValueError("extension_columns not supported if there is "
-                             "no pandas_metadata")
-        ext_columns_dtypes = _get_extension_dtypes(
-            table, [], extension_columns)
+        ext_columns_dtypes = _get_extension_dtypes(table, [], types_mapper)
 
     _check_data_column_metadata_consistency(all_columns)
     columns = _deserialize_column_index(table, all_columns, column_indexes)
@@ -782,7 +778,7 @@
     ])
 
 
-def _get_extension_dtypes(table, columns_metadata, extension_columns):
+def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
     """
     Based on the stored column pandas metadata and the extension types
     in the arrow schema, infer which columns should be converted to a
@@ -799,46 +795,38 @@ def _get_extension_dtypes(table, columns_metadata, types_mapper=None):
 
     # older pandas version that does not yet support extension dtypes
     if _pandas_api.extension_dtype is None:
-        if extension_columns is not None:
-            raise ValueError(
-                "Converting to pandas ExtensionDtypes is not supported")
         return ext_columns
 
-    if extension_columns is None:
-        # infer the extension columns from the pandas metadata
-        for col_meta in columns_metadata:
-            name = col_meta['name']
-            dtype = col_meta['numpy_type']
-            if dtype not in _pandas_supported_numpy_types:
-                # pandas_dtype is expensive, so avoid doing this for types
-                # that are certainly numpy dtypes
-                pandas_dtype = _pandas_api.pandas_dtype(dtype)
-                if isinstance(pandas_dtype, _pandas_api.extension_dtype):
-                    if hasattr(pandas_dtype, "__from_arrow__"):
-                        ext_columns[name] = pandas_dtype
-        # infer from extension type in the schema
-        for field in table.schema:
-            typ = field.type
-            if isinstance(typ, pa.BaseExtensionType):
-                try:
-                    pandas_dtype = typ.to_pandas_dtype()
-                except NotImplementedError:
-                    pass
-                else:
-                    ext_columns[field.name] = pandas_dtype
-    else:
-        # get the extension dtype for the specified columns
-        for name in extension_columns:
-            col_meta = [
-                meta for meta in columns_metadata if meta['name'] == name][0]
-            pandas_dtype = _pandas_api.pandas_dtype(col_meta['numpy_type'])
-            if not isinstance(pandas_dtype, _pandas_api.extension_dtype):
-                raise ValueError("not an extension dtype")
-            if not hasattr(pandas_dtype, "__from_arrow__"):
-                raise ValueError("this column does not support to be "
-                                 "converted to extension dtype")
-            ext_columns[name] = pandas_dtype
+    # infer the extension columns from the pandas metadata
+    for col_meta in columns_metadata:
+        name = col_meta['name']
+        dtype = col_meta['numpy_type']
+        if dtype not in _pandas_supported_numpy_types:
+            # pandas_dtype is expensive, so avoid doing this for types
+            # that are certainly numpy dtypes
+            pandas_dtype = _pandas_api.pandas_dtype(dtype)
+            if isinstance(pandas_dtype, _pandas_api.extension_dtype):
+                if hasattr(pandas_dtype, "__from_arrow__"):
+                    ext_columns[name] = pandas_dtype
+
+    # infer from extension type in the schema
+    for field in table.schema:
+        typ = field.type
+        if isinstance(typ, pa.BaseExtensionType):
+            try:
+                pandas_dtype = typ.to_pandas_dtype()
+            except NotImplementedError:
+                pass
+            else:
+                ext_columns[field.name] = pandas_dtype
+
+    # use the specified mapping of built-in arrow types to pandas dtypes
+    if types_mapper:
+        for field in table.schema:
+            typ = field.type
+            pandas_dtype = types_mapper(typ)
+            if pandas_dtype is not None:
+                ext_columns[field.name] = pandas_dtype
 
     return ext_columns
```

@jorisvandenbossche (Member, Author) commented on Jan 23, 2020, on the removed else block:

I removed this "else" branch for when extension_columns was specified, as it is no longer used (I added it initially to be able to specify which columns to convert, for testing, when inference from the metadata was not yet implemented).

(This makes the diff a bit harder to read, but in this whole _get_extension_dtypes basically only the `if types_mapper:` block is added; the rest is only dedented.)
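
Editor's note: the metadata- and schema-based inference above keys off the `__from_arrow__` hook that pandas ExtensionDtypes can implement. A minimal sketch of that hook in action, assuming a pandas version with nullable dtypes (>= 0.26.0.dev, per the version check in the test below):

```python
import pyarrow as pa
import pandas as pd

# pd.Int64Dtype implements __from_arrow__: given a pyarrow Array or
# ChunkedArray, it returns the matching pandas ExtensionArray.
dtype = pd.Int64Dtype()
chunked = pa.chunked_array([[1, 2, None]])
ext_array = dtype.__from_arrow__(chunked)
print(ext_array)  # pandas IntegerArray: [1, 2, <NA>]
```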
6 changes: 4 additions & 2 deletions python/pyarrow/table.pxi
```diff
@@ -1361,11 +1361,13 @@ cdef class Table(_PandasConvertible):
 
         return result
 
-    def _to_pandas(self, options, categories=None, ignore_metadata=False):
+    def _to_pandas(self, options, categories=None, ignore_metadata=False,
+                   types_mapper=None):
         from pyarrow.pandas_compat import table_to_blockmanager
         mgr = table_to_blockmanager(
             options, self, categories,
-            ignore_metadata=ignore_metadata)
+            ignore_metadata=ignore_metadata,
+            types_mapper=types_mapper)
         return pandas_api.data_frame(mgr)
 
     def to_pydict(self):
```
28 changes: 28 additions & 0 deletions python/pyarrow/tests/test_pandas.py
```diff
@@ -3564,6 +3564,34 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch):
         table.to_pandas()
 
 
+def test_to_pandas_extension_dtypes_mapping():
+    if LooseVersion(pd.__version__) < "0.26.0.dev":
+        pytest.skip("Conversion to pandas IntegerArray not yet supported")
+
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int64())})
+
+    # default use numpy dtype
+    result = table.to_pandas()
+    assert result['a'].dtype == np.dtype('int64')
+
+    # specify to override the default
+    result = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
+    assert isinstance(result['a'].dtype, pd.Int64Dtype)
+
+    # types that return None in function get normal conversion
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int32())})
+    result = table.to_pandas(types_mapper={pa.int64(): pd.Int64Dtype()}.get)
+    assert result['a'].dtype == np.dtype('int32')
+
+    # `types_mapper` overrules the pandas metadata
+    table = pa.table(pd.DataFrame({'a': pd.array([1, 2, 3], dtype="Int64")}))
+    result = table.to_pandas()
+    assert isinstance(result['a'].dtype, pd.Int64Dtype)
+    result = table.to_pandas(
+        types_mapper={pa.int64(): pd.PeriodDtype('D')}.get)
+    assert isinstance(result['a'].dtype, pd.PeriodDtype)
+
+
 # ----------------------------------------------------------------------
 # Legacy metadata compatibility tests
```
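
Editor's note, not part of the PR: a dict-based mapper only matches exact DataType instances (e.g. `pa.int64()`), so for whole families of types, or for parametrized types such as timestamps, a small function is the more natural `types_mapper`; returning `None` keeps the default conversion. A minimal sketch, assuming pandas nullable dtypes are available:

```python
import pyarrow as pa
import pandas as pd

def nullable_mapper(arrow_type):
    # For simplicity, map any Arrow integer type to pandas' nullable
    # Int64; every other type returns None and keeps the default.
    if pa.types.is_integer(arrow_type):
        return pd.Int64Dtype()
    return None

table = pa.table({'a': pa.array([1, None, 3], pa.int64()),
                  'b': pa.array(['x', 'y', 'z'])})
df = table.to_pandas(types_mapper=nullable_mapper)
assert isinstance(df['a'].dtype, pd.Int64Dtype)
assert df['b'].dtype == object  # strings keep the default conversion
```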