clean-up, remove extension_column kwarg in to_pandas, add docs
jorisvandenbossche committed Oct 30, 2019
1 parent 6f6b6f6 commit 013d904
Showing 5 changed files with 99 additions and 57 deletions.
36 changes: 36 additions & 0 deletions docs/source/python/extending_types.rst
@@ -224,3 +224,39 @@ data type from above would look like::
return PeriodType, (self.freq,)

Also the storage type does not need to be fixed but can be parametrized.

Conversion to pandas
~~~~~~~~~~~~~~~~~~~~

The conversion to pandas (in :meth:`Table.to_pandas`) of columns with an
extension type can be controlled in case there is a corresponding
`pandas extension array <https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extension-types>`__
for your extension type.

For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be
implemented, and should return a ``pandas.api.extensions.ExtensionDtype``
subclass instance.

Using the pandas period type from above as an example, this would look like::

    class PeriodType(pa.ExtensionType):
        ...

        def to_pandas_dtype(self):
            import pandas as pd
            return pd.PeriodDtype(freq=self.freq)

Secondly, the pandas ``ExtensionDtype`` in turn needs to have the
``__from_arrow__`` method implemented: a method that, given a pyarrow Array
or ChunkedArray of the extension type, can construct the corresponding
pandas ``ExtensionArray``. This method should have the following signature::

    class MyExtensionDtype(pd.api.extensions.ExtensionDtype):
        ...

        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray:
            ...

This way, you can control the conversion of a pyarrow ``Array`` of your
extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame.
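
As an illustration only (this sketch is not part of the committed docs: it
assumes the storage array holds period ordinals that
``pandas.arrays.PeriodArray`` can consume directly, and it ignores null
handling for brevity), a ``__from_arrow__`` for the period example could
look like::

    import pandas as pd
    import pyarrow as pa

    class PeriodDtype(pd.api.extensions.ExtensionDtype):
        ...

        def __from_arrow__(self, array):
            # accept both a pyarrow Array and a ChunkedArray
            chunks = (array.chunks if isinstance(array, pa.ChunkedArray)
                      else [array])
            results = []
            for chunk in chunks:
                # the values of an extension array live in its storage
                # array (int64 period ordinals in this example)
                ordinals = chunk.storage.to_numpy()
                results.append(
                    pd.arrays.PeriodArray(ordinals, freq=self.freq))
            return pd.arrays.PeriodArray._concat_same_type(results)
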
4 changes: 1 addition & 3 deletions python/pyarrow/array.pxi
@@ -421,8 +421,7 @@ cdef class _PandasConvertible:
bint date_as_object=True,
bint use_threads=True,
bint deduplicate_objects=True,
bint ignore_metadata=False,
extension_columns=None,
bint ignore_metadata=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -467,7 +466,6 @@
deduplicate_objects=deduplicate_objects
)
return self._to_pandas(options, categories=categories,
extension_columns=extension_columns,
ignore_metadata=ignore_metadata)


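For reference, a minimal usage sketch of the simplified API (a hypothetical
session, assuming a pandas version whose ``Int64Dtype`` implements
``__from_arrow__``): which columns come back as pandas extension arrays is
now inferred from the pandas metadata and the dtype protocol rather than
passed explicitly::

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
    table = pa.table(df)

    # before this commit one could write
    # table.to_pandas(extension_columns=['a']);
    # now the nullable column round-trips automatically
    result = table.to_pandas()
    print(result.dtypes)  # a    Int64
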
39 changes: 32 additions & 7 deletions python/pyarrow/pandas_compat.py
@@ -61,7 +61,6 @@ def get_logical_type_map():
pa.lib.Type_BINARY: 'bytes',
pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
pa.lib.Type_STRING: 'unicode',
# pa.lib.Type_EXTENSION: 'extension',
})
return _logical_type_map

@@ -647,10 +646,34 @@ def serialized_dict_to_dataframe(data):


def _reconstruct_block(item, columns=None, extension_columns=None):
"""
Construct a pandas Block from the `item` dictionary coming from pyarrow's
serialization or returned by arrow::python::ConvertTableToPandas.
This function takes care of converting dictionary types to pandas
categorical, Timestamp-with-timezones to the proper pandas Block, and
conversion to pandas ExtensionBlock
Parameters
----------
item : dict
For basic types, this is a dictionary in the form of
{'block': np.ndarray of values, 'placement': pandas block placement}.
Additional keys are present for other types (dictionary, timezone,
object).
columns :
Column names of the table being constructed, used for extension types
extension_columns : dict
Dictionary of {column_name: pandas_dtype} that includes all columns
and corresponding dtypes that will be converted to a pandas
ExtensionBlock.
Returns
-------
pandas Block
"""
import pandas.core.internals as _int
# Construct the individual blocks converting dictionary types to pandas
# categorical types and Timestamps-with-timezones types to the proper
# pandas Blocks

block_arr = item.get('block', None)
placement = item['placement']
@@ -675,7 +698,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
name = columns[placement[0]]
pandas_dtype = extension_columns[name]
if not hasattr(pandas_dtype, '__from_arrow__'):
raise ValueError("This column does not support")
raise ValueError("This column does not support to be converted "
"to a pandas ExtensionArray")
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
block = _int.make_block(pd_ext_arr, placement=placement,
klass=_int.ExtensionBlock)
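
For illustration, a minimal sketch (with made-up values) of the kind of
``item`` dictionary described in the docstring above, for a plain numeric
column::

    import numpy as np

    item = {
        # 2D values array: one row per column stored in this block
        'block': np.array([[1, 2, 3]]),
        # which DataFrame column slots this block occupies
        'placement': [0],
    }
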
@@ -729,8 +753,9 @@ def table_to_blockmanager(options, table, categories=None,

def _get_extension_dtypes(table, columns_metadata, extension_columns):
"""
Based on the stored column pandas metadata, infer which columns
should be converted to a pandas extension dtype.
Based on the stored column pandas metadata and the extension types
in the arrow schema, infer which columns should be converted to a
pandas extension dtype.
The 'numpy_type' field in the column metadata stores the string
representation of the original pandas dtype (and, despite its name,
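As a rough sketch of the two inference sources this docstring describes (an
approximation for illustration, not the committed implementation)::

    import pandas as pd
    import pyarrow as pa

    def get_extension_dtypes_sketch(table, columns_metadata):
        ext_columns = {}
        # 1. pandas metadata: 'numpy_type' holds the original pandas dtype
        #    string; if it resolves to a dtype implementing __from_arrow__,
        #    restore the column as an extension column
        for col_meta in columns_metadata:
            try:
                pandas_dtype = pd.api.types.pandas_dtype(
                    col_meta['numpy_type'])
            except TypeError:
                continue
            if hasattr(pandas_dtype, '__from_arrow__'):
                ext_columns[col_meta['name']] = pandas_dtype
        # 2. arrow schema: extension types that advertise a pandas
        #    counterpart through to_pandas_dtype()
        for field in table.schema:
            if isinstance(field.type, pa.ExtensionType):
                try:
                    pandas_dtype = field.type.to_pandas_dtype()
                except NotImplementedError:
                    continue
                if hasattr(pandas_dtype, '__from_arrow__'):
                    ext_columns[field.name] = pandas_dtype
        return ext_columns
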
6 changes: 2 additions & 4 deletions python/pyarrow/table.pxi
@@ -1269,12 +1269,10 @@ cdef class Table(_PandasConvertible):

return result

def _to_pandas(self, options, categories=None, extension_columns=None,
ignore_metadata=False):
def _to_pandas(self, options, categories=None, ignore_metadata=False):
from pyarrow.pandas_compat import table_to_blockmanager
mgr = table_to_blockmanager(
options, self, categories, extension_columns,
ignore_metadata=ignore_metadata)
options, self, categories, ignore_metadata=ignore_metadata)
return pandas_api.data_frame(mgr)

def to_pydict(self):
71 changes: 28 additions & 43 deletions python/pyarrow/tests/test_pandas.py
@@ -3244,7 +3244,7 @@ def _Int64Dtype__from_arrow__(self, array):
return int_arr


def test_convert_to_extension_array():
def test_convert_to_extension_array(monkeypatch):
if LooseVersion(pd.__version__) < "0.26.0.dev":
pytest.skip("Conversion from IntegerArray to arrow not yet supported")

@@ -3262,37 +3262,24 @@ def test_convert_to_extension_array():
assert len(result._data.blocks) == 1
assert isinstance(result._data.blocks[0], _int.IntBlock)

# raise error is explicitly asking for unsupported conversion
with pytest.raises(ValueError):
table.to_pandas(extension_columns=['b'])

try:
# patch pandas Int64Dtype to have the protocol method
pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__

# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)

# explicitly specifying the column works as well
# TODO is this useful?
result = table.to_pandas(extension_columns=['b'])
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)
# patch pandas Int64Dtype to have the protocol method
monkeypatch.setattr(
pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
raising=False)

# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas(extension_columns=['a'])
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
tm.assert_frame_equal(result, df2)
# Int64Dtype is recognized -> convert to extension block by default
# for a proper roundtrip
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.IntBlock)
assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
tm.assert_frame_equal(result, df)

finally:
del pd.Int64Dtype.__from_arrow__
# test with missing values
df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
table2 = pa.table(df2)
result = table2.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
tm.assert_frame_equal(result, df2)


class MyCustomIntegerType(pa.PyExtensionType):
@@ -3307,7 +3294,7 @@ def to_pandas_dtype(self):
return pd.Int64Dtype()


def test_conversion_extensiontype_to_extensionarray():
def test_conversion_extensiontype_to_extensionarray(monkeypatch):
# converting extension type to linked pandas ExtensionDtype/Array
import pandas.core.internals as _int

@@ -3318,19 +3305,17 @@ def test_conversion_extensiontype_to_extensionarray():
with pytest.raises(ValueError):
table.to_pandas()

try:
# patch pandas Int64Dtype to have the protocol method
pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__
# patch pandas Int64Dtype to have the protocol method
monkeypatch.setattr(
pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
raising=False)

# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)

finally:
del pd.Int64Dtype.__from_arrow__
# extension type points to Int64Dtype, which knows how to create a
# pandas ExtensionArray
result = table.to_pandas()
assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
tm.assert_frame_equal(result, expected)


# ----------------------------------------------------------------------
