diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst
index 2314be9327ea2..201fcc550ce3f 100644
--- a/docs/source/python/extending_types.rst
+++ b/docs/source/python/extending_types.rst
@@ -224,3 +224,39 @@ data type from above would look like::
         return PeriodType, (self.freq,)
 
 Also the storage type does not need to be fixed but can be parametrized.
+
+Conversion to pandas
+~~~~~~~~~~~~~~~~~~~~
+
+The conversion to pandas (in :meth:`Table.to_pandas`) of columns with an
+extension type can be controlled when there is a corresponding
+`pandas extension array `__
+for your extension type.
+
+For this, the :meth:`ExtensionType.to_pandas_dtype` method needs to be
+implemented, and should return a ``pandas.api.extensions.ExtensionDtype``
+subclass instance.
+
+Using the pandas period type from above as an example, this would look like::
+
+    class PeriodType(pa.ExtensionType):
+        ...
+
+        def to_pandas_dtype(self):
+            import pandas as pd
+            return pd.PeriodDtype(freq=self.freq)
+
+Secondly, the pandas ``ExtensionDtype`` in turn needs to have the
+``__from_arrow__`` method implemented: a method that, given a pyarrow Array
+or ChunkedArray of the extension type, can construct the corresponding
+pandas ``ExtensionArray``. This method should have the following signature::
+
+    class MyExtensionDtype(pd.api.extensions.ExtensionDtype):
+        ...
+
+        def __from_arrow__(self, array: pyarrow.Array/ChunkedArray) -> pandas.ExtensionArray:
+            ...
+
+This way, you can control the conversion of a pyarrow ``Array`` of your
+extension type to a pandas ``ExtensionArray`` that can be stored in a DataFrame.
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 2793825e80229..afbfd0b1735bf 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -421,8 +421,7 @@ cdef class _PandasConvertible:
             bint date_as_object=True,
             bint use_threads=True,
             bint deduplicate_objects=True,
-            bint ignore_metadata=False,
-            extension_columns=None,
+            bint ignore_metadata=False
     ):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -467,7 +466,6 @@ cdef class _PandasConvertible:
             deduplicate_objects=deduplicate_objects
         )
         return self._to_pandas(options, categories=categories,
-                               extension_columns=extension_columns,
                                ignore_metadata=ignore_metadata)
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 3c8e6a71a3bfb..3ff327b504150 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -61,7 +61,6 @@ def get_logical_type_map():
         pa.lib.Type_BINARY: 'bytes',
         pa.lib.Type_FIXED_SIZE_BINARY: 'bytes',
         pa.lib.Type_STRING: 'unicode',
-        # pa.lib.Type_EXTENSION: 'extension',
     })
     return _logical_type_map
 
@@ -647,10 +646,34 @@ def serialized_dict_to_dataframe(data):
 
 def _reconstruct_block(item, columns=None, extension_columns=None):
+    """
+    Construct a pandas Block from the `item` dictionary coming from pyarrow's
+    serialization or returned by arrow::python::ConvertTableToPandas.
+
+    This function takes care of converting dictionary types to pandas
+    categoricals, timestamps with timezones to the proper pandas Blocks,
+    and extension dtypes to pandas ExtensionBlocks.
+
+    Parameters
+    ----------
+    item : dict
+        For basic types, this is a dictionary in the form of
+        {'block': np.ndarray of values, 'placement': pandas block placement}.
+        Additional keys are present for other types (dictionary, timezone,
+        object).
+    columns : list
+        Column names of the table being constructed, used for extension types
+    extension_columns : dict
+        Dictionary of {column_name: pandas_dtype} that includes all columns
+        and corresponding dtypes that will be converted to a pandas
+        ExtensionBlock.
+
+    Returns
+    -------
+    pandas Block
+
+    """
     import pandas.core.internals as _int
-    # Construct the individual blocks converting dictionary types to pandas
-    # categorical types and Timestamps-with-timezones types to the proper
-    # pandas Blocks
 
     block_arr = item.get('block', None)
     placement = item['placement']
@@ -675,7 +698,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
         name = columns[placement[0]]
         pandas_dtype = extension_columns[name]
         if not hasattr(pandas_dtype, '__from_arrow__'):
-            raise ValueError("This column does not support")
+            raise ValueError("This column does not support conversion "
+                             "to a pandas ExtensionArray")
         pd_ext_arr = pandas_dtype.__from_arrow__(arr)
         block = _int.make_block(pd_ext_arr, placement=placement,
                                 klass=_int.ExtensionBlock)
@@ -729,8 +753,9 @@ def table_to_blockmanager(options, table, categories=None,
 
 def _get_extension_dtypes(table, columns_metadata, extension_columns):
     """
-    Based on the stored column pandas metadata, infer which columns
-    should be converted to a pandas extension dtype.
+    Based on the stored column pandas metadata and the extension types
+    in the arrow schema, infer which columns should be converted to a
+    pandas extension dtype.
 
     The 'numpy_type' field in the column metadata stores the string
     representation of the original pandas dtype (and, despite its name,
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 73dfcfd406b82..ab4c2737dc8dc 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -1269,12 +1269,10 @@ cdef class Table(_PandasConvertible):
 
         return result
 
-    def _to_pandas(self, options, categories=None, extension_columns=None,
-                   ignore_metadata=False):
+    def _to_pandas(self, options, categories=None, ignore_metadata=False):
        from pyarrow.pandas_compat import table_to_blockmanager
         mgr = table_to_blockmanager(
-            options, self, categories, extension_columns,
-            ignore_metadata=ignore_metadata)
+            options, self, categories, ignore_metadata=ignore_metadata)
         return pandas_api.data_frame(mgr)
 
     def to_pydict(self):
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index ff4b95e12027f..fa1f4bc770b36 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -3244,7 +3244,7 @@ def _Int64Dtype__from_arrow__(self, array):
     return int_arr
 
 
-def test_convert_to_extension_array():
+def test_convert_to_extension_array(monkeypatch):
     if LooseVersion(pd.__version__) < "0.26.0.dev":
         pytest.skip("Conversion from IntegerArray to arrow not yet supported")
 
@@ -3262,37 +3262,24 @@ def test_convert_to_extension_array(monkeypatch):
     assert len(result._data.blocks) == 1
     assert isinstance(result._data.blocks[0], _int.IntBlock)
 
-    # raise error is explicitly asking for unsupported conversion
-    with pytest.raises(ValueError):
-        table.to_pandas(extension_columns=['b'])
-
-    try:
-        # patch pandas Int64Dtype to have the protocol method
-        pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__
-
-        # Int64Dtype is recognized -> convert to extension block by default
-        # for a proper roundtrip
-        result = table.to_pandas()
-        assert isinstance(result._data.blocks[0], _int.IntBlock)
-        assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
-        tm.assert_frame_equal(result, df)
-
-        # explicitly specifying the column works as well
-        # TODO is this useful?
-        result = table.to_pandas(extension_columns=['b'])
-        assert isinstance(result._data.blocks[0], _int.IntBlock)
-        assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
-        tm.assert_frame_equal(result, df)
+    # patch pandas Int64Dtype to have the protocol method
+    monkeypatch.setattr(
+        pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
+        raising=False)
 
-        # test with missing values
-        df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
-        table2 = pa.table(df2)
-        result = table2.to_pandas(extension_columns=['a'])
-        assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
-        tm.assert_frame_equal(result, df2)
+    # Int64Dtype is recognized -> convert to extension block by default
+    # for a proper roundtrip
+    result = table.to_pandas()
+    assert isinstance(result._data.blocks[0], _int.IntBlock)
+    assert isinstance(result._data.blocks[1], _int.ExtensionBlock)
+    tm.assert_frame_equal(result, df)
 
-    finally:
-        del pd.Int64Dtype.__from_arrow__
+    # test with missing values
+    df2 = pd.DataFrame({'a': pd.array([1, 2, None], dtype='Int64')})
+    table2 = pa.table(df2)
+    result = table2.to_pandas()
+    assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
+    tm.assert_frame_equal(result, df2)
 
 
 class MyCustomIntegerType(pa.PyExtensionType):
@@ -3307,7 +3294,7 @@ def to_pandas_dtype(self):
         return pd.Int64Dtype()
 
 
-def test_conversion_extensiontype_to_extensionarray():
+def test_conversion_extensiontype_to_extensionarray(monkeypatch):
     # converting extension type to linked pandas ExtensionDtype/Array
     import pandas.core.internals as _int
 
@@ -3318,19 +3305,17 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch):
     with pytest.raises(ValueError):
         table.to_pandas()
 
-    try:
-        # patch pandas Int64Dtype to have the protocol method
-        pd.Int64Dtype.__from_arrow__ = _Int64Dtype__from_arrow__
+    # patch pandas Int64Dtype to have the protocol method
+    monkeypatch.setattr(
+        pd.Int64Dtype, '__from_arrow__', _Int64Dtype__from_arrow__,
+        raising=False)
 
-        # extension type points to Int64Dtype, which knows how to create a
-        # pandas ExtensionArray
-        result = table.to_pandas()
-        assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
-        expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
-        tm.assert_frame_equal(result, expected)
-
-    finally:
-        del pd.Int64Dtype.__from_arrow__
+    # extension type points to Int64Dtype, which knows how to create a
+    # pandas ExtensionArray
+    result = table.to_pandas()
+    assert isinstance(result._data.blocks[0], _int.ExtensionBlock)
+    expected = pd.DataFrame({'a': pd.array([1, 2, 3, 4], dtype='Int64')})
+    tm.assert_frame_equal(result, expected)
 
 
 # -----------------------------------------------------------------------
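
For illustration, here is a minimal sketch of what a ``__from_arrow__``
implementation targeting pandas' nullable ``Int64`` dtype could look like, in
the spirit of the ``_Int64Dtype__from_arrow__`` helper that the tests above
patch in. The function name and the NaN-based null handling are assumptions
of this sketch, not part of the patch; it accepts both ``Array`` and
``ChunkedArray`` inputs, as the protocol requires::

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    def _int64_dtype_from_arrow(self, array):
        # hypothetical helper, not the one from the test suite;
        # the protocol must accept both a pyarrow Array and a ChunkedArray,
        # so normalize to a list of chunks and handle them uniformly
        chunks = array.chunks if isinstance(array, pa.ChunkedArray) else [array]
        # zero_copy_only=False also converts chunks containing nulls;
        # those come back as float64 with NaN in the null positions
        values = np.concatenate(
            [chunk.to_numpy(zero_copy_only=False) for chunk in chunks])
        if values.dtype.kind == 'f':
            mask = np.isnan(values)
        else:
            mask = np.zeros(len(values), dtype=bool)
        # IntegerArray stores the data plus a boolean mask in which
        # True marks a missing value
        return pd.arrays.IntegerArray(
            np.nan_to_num(values).astype('int64'), mask)

The tests attach such a helper with ``monkeypatch.setattr(pd.Int64Dtype,
'__from_arrow__', ...)`` because, at the time of this patch, released pandas
versions did not yet ship the protocol method on ``Int64Dtype`` themselves.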
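And a minimal end-to-end sketch of the conversion this patch enables, along
the lines of ``MyCustomIntegerType`` in the tests. ``MyIntType`` is a
hypothetical name, and the last lines assume a pandas version whose
``Int64Dtype`` already implements ``__from_arrow__`` (or that a helper like
the one above has been attached)::

    import pandas as pd
    import pyarrow as pa

    class MyIntType(pa.PyExtensionType):
        # hypothetical extension type backed by int64 storage

        def __init__(self):
            pa.PyExtensionType.__init__(self, pa.int64())

        def __reduce__(self):
            # required so the type survives pickling / IPC roundtrips
            return MyIntType, ()

        def to_pandas_dtype(self):
            # point Table.to_pandas at pandas' nullable Int64 dtype;
            # its __from_arrow__ builds the resulting ExtensionArray
            return pd.Int64Dtype()

    storage = pa.array([1, 2, None], type=pa.int64())
    arr = pa.ExtensionArray.from_storage(MyIntType(), storage)
    table = pa.table({'a': arr})

    df = table.to_pandas()  # column 'a' is converted via __from_arrow__
    print(df['a'].dtype)    # Int64

Without ``to_pandas_dtype``, or with a dtype lacking ``__from_arrow__``,
``to_pandas`` raises a ``ValueError``, as exercised by
``test_conversion_extensiontype_to_extensionarray`` above.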