From 8cce4c6f6a06c889655c571e87b65aa4b10552f6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2019 22:57:00 +0200 Subject: [PATCH] ENH: Add IntegerArray.__arrow_array__ for custom conversion to Arrow (#28368) * ENH: Add IntegerArray.__arrow_array__ for custom conversion to Arrow * simplify pyarrow version check in tests * add whatsnew --- doc/source/whatsnew/v1.0.0.rst | 10 +++++++--- pandas/core/arrays/integer.py | 8 ++++++++ pandas/tests/arrays/test_integer.py | 12 ++++++++++++ pandas/tests/io/test_parquet.py | 12 ++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9998a9a8476431..bc77553924dfab 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -20,15 +20,19 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) -- + .. _whatsnew_1000.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) +- The :ref:`integer dtype <integer_na>` with support for missing values can now be converted to + ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format + when using the ``pyarrow`` engine. It is currently not yet supported when converting back to + pandas (so it will become an integer or float dtype depending on the presence of missing data). + (:issue:`28368`) - .. _whatsnew_1000.api_breaking: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 069d661e6af34d..7b03bf35faf252 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -367,6 +367,14 @@ def __array__(self, dtype=None): """ return self._coerce_to_ndarray() + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 31a9a0483081ed..55e25caafc4ee4 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.generic import ABCIndexClass import pandas as pd @@ -817,6 +819,16 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) +@td.skip_if_no("pyarrow", min_version="0.14.1.dev") +def test_arrow_array(data): + # protocol added in 0.15.0 + import pyarrow as pa + + arr = pa.array(data) + expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + assert arr.equals(expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9573ac15dc45fa..efc2b6d6c5b3d7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -488,6 +488,18 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) + @td.skip_if_no("pyarrow", min_version="0.14.1.dev") + def test_nullable_integer(self, pa): + df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")}) + # currently de-serialized as plain int + expected = df.assign(a=df.a.astype("int64")) + check_round_trip(df, pa, expected=expected) + + df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) + # if missing values currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) + check_round_trip(df, pa, expected=expected) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.2.1")