From 87a53858b2462a73f5c8c7357c0db4764f20c380 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 29 Nov 2018 09:53:21 -0600 Subject: [PATCH] API: Public data for Series and Index: .array and .to_numpy() (#23623) --- doc/source/10min.rst | 31 ++++++- doc/source/advanced.rst | 2 +- doc/source/basics.rst | 104 ++++++++++++++++------ doc/source/categorical.rst | 4 +- doc/source/dsintro.rst | 40 ++++++++- doc/source/enhancingperf.rst | 8 +- doc/source/extending.rst | 2 +- doc/source/indexing.rst | 2 +- doc/source/missing_data.rst | 2 +- doc/source/reshaping.rst | 14 +-- doc/source/text.rst | 4 +- doc/source/timeseries.rst | 10 +-- doc/source/whatsnew/v0.24.0.rst | 49 ++++++++++- pandas/core/base.py | 123 ++++++++++++++++++++++++++- pandas/core/frame.py | 44 ++++++++++ pandas/core/generic.py | 5 ++ pandas/core/indexes/base.py | 19 ++++- pandas/core/indexes/multi.py | 19 +++++ pandas/core/series.py | 11 +++ pandas/tests/frame/test_api.py | 6 ++ pandas/tests/indexes/test_numeric.py | 6 ++ pandas/tests/test_base.py | 51 +++++++++++ 22 files changed, 501 insertions(+), 55 deletions(-) diff --git a/doc/source/10min.rst b/doc/source/10min.rst index ddcb9b5fc26c75..e04a8253e0bef1 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -113,13 +113,40 @@ Here is how to view the top and bottom rows of the frame: df.head() df.tail(3) -Display the index, columns, and the underlying NumPy data: +Display the index, columns: .. ipython:: python df.index df.columns - df.values + +:meth:`DataFrame.to_numpy` gives a NumPy representation of the underlying data. +Note that this can be an expensive operation when your :class:`DataFrame` has +columns with different data types, which comes down to a fundamental difference +between pandas and NumPy: **NumPy arrays have one dtype for the entire array, +while pandas DataFrames have one dtype per column**. When you call +:meth:`DataFrame.to_numpy`, pandas will find the NumPy dtype that can hold *all* +of the dtypes in the DataFrame. 
This may end up being ``object``, which requires +casting every value to a Python object. + +For ``df``, our :class:`DataFrame` of all floating-point values, +:meth:`DataFrame.to_numpy` is fast and doesn't require copying data. + +.. ipython:: python + + df.to_numpy() + +For ``df2``, the :class:`DataFrame` with multiple dtypes, +:meth:`DataFrame.to_numpy` is relatively expensive. + +.. ipython:: python + + df2.to_numpy() + +.. note:: + + :meth:`DataFrame.to_numpy` does *not* include the index or column + labels in the output. :func:`~DataFrame.describe` shows a quick statistic summary of your data: diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 17214ab62b2782..24a1ac7be7d1db 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -188,7 +188,7 @@ highly performant. If you want to see only the used levels, you can use the .. ipython:: python - df[['foo', 'qux']].columns.values + df[['foo', 'qux']].columns.to_numpy() # for a specific level df[['foo', 'qux']].columns.get_level_values(0) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 69d1e105f62ab4..25e2c8cd1ff9a9 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -46,8 +46,8 @@ of elements to display is five, but you may pass a custom number. .. _basics.attrs: -Attributes and the raw ndarray(s) ---------------------------------- +Attributes and Underlying Data +------------------------------ pandas objects have a number of attributes enabling you to access the metadata @@ -65,14 +65,43 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -To get the actual data inside a data structure, one need only access the -**values** property: +Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +thought of as containers for arrays, which hold the actual data and do the +actual computation. For many types, the underlying array is a +:class:`numpy.ndarray`. 
However, pandas and 3rd party libraries may *extend* +NumPy's type system to add support for custom arrays +(see :ref:`basics.dtypes`). + +To get the actual data inside a :class:`Index` or :class:`Series`, use +the **array** property + +.. ipython:: python + + s.array + s.index.array + +Depending on the data type (see :ref:`basics.dtypes`), :attr:`~Series.array` +will be either a NumPy array or an :ref:`ExtensionArray <extending.extension-types>`. +If you know you need a NumPy array, use :meth:`~Series.to_numpy` +or :func:`numpy.asarray`. .. ipython:: python - s.values - df.values - wp.values + s.to_numpy() + np.asarray(s) + +For Series and Indexes backed by NumPy arrays (like we have here), this will +be the same as :attr:`~Series.array`. When the Series or Index is backed by +a :class:`~pandas.api.extensions.ExtensionArray`, :meth:`~Series.to_numpy` +may involve copying data and coercing values. + +Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more +complex. When your ``DataFrame`` only has a single data type for all the +columns, :meth:`DataFrame.to_numpy` will return the underlying data: + +.. ipython:: python + + df.to_numpy() If a DataFrame or Panel contains homogeneously-typed data, the ndarray can actually be modified in-place, and the changes will be reflected in the data @@ -87,6 +116,21 @@ unlike the axis labels, cannot be assigned to. strings are involved, the result will be of object dtype. If there are only floats and integers, the resulting array will be of float dtype. +In the past, pandas recommended :attr:`Series.values` or :attr:`DataFrame.values` +for extracting the data from a Series or DataFrame. You'll still find references +to these in old code bases and online. Going forward, we recommend avoiding +``.values`` and using ``.array`` or ``.to_numpy()``. ``.values`` has the following +drawbacks: + +1. When your Series contains an :ref:`extension type <extending.extension-types>`, it's + unclear whether :attr:`Series.values` returns a NumPy array or the extension array. 
+ :attr:`Series.array` will always return the actual array backing the Series, + while :meth:`Series.to_numpy` will always return a NumPy array. +2. When your DataFrame contains a mixture of data types, :attr:`DataFrame.values` may + involve copying data and coercing values to a common dtype, a relatively expensive + operation. :meth:`DataFrame.to_numpy`, being a method, makes it clearer that the + returned NumPy array may not be a view on the same data in the DataFrame. + .. _basics.accelerate: Accelerated operations @@ -541,7 +585,7 @@ will exclude NAs on Series input by default: .. ipython:: python np.mean(df['one']) - np.mean(df['one'].values) + np.mean(df['one'].to_numpy()) :meth:`Series.nunique` will return the number of unique non-NA values in a Series: @@ -839,7 +883,7 @@ Series operation on each column or row: tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=10)) - tsdf.values[3:7] = np.nan + tsdf.iloc[3:7] = np.nan .. ipython:: python @@ -1875,17 +1919,29 @@ dtypes ------ For the most part, pandas uses NumPy arrays and dtypes for Series or individual -columns of a DataFrame. The main types allowed in pandas objects are ``float``, -``int``, ``bool``, and ``datetime64[ns]`` (note that NumPy does not support -timezone-aware datetimes). - -In addition to NumPy's types, pandas :ref:`extends ` -NumPy's type-system for a few cases. - -* :ref:`Categorical ` -* :ref:`Datetime with Timezone ` -* :ref:`Period ` -* :ref:`Interval ` +columns of a DataFrame. NumPy provides support for ``float``, +``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy +does not support timezone-aware datetimes). + +Pandas and third-party libraries *extend* NumPy's type system in a few places. +This section describes the extensions pandas has made internally. +See :ref:`extending.extension-types` for how to write your own extension that +works with pandas. 
See :ref:`ecosystem.extensions` for a list of third-party +libraries that have implemented an extension. + +The following table lists all of pandas' extension types. See the respective +documentation sections for more on each type. + +=================== ========================= ================== ============================= ============================= +Kind of Data Data Type Scalar Array Documentation +=================== ========================= ================== ============================= ============================= +tz-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` +Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` +period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` +sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` +intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` +nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` +=================== ========================= ================== ============================= ============================= Pandas uses the ``object`` dtype for storing strings. @@ -1983,13 +2039,13 @@ from the current type (e.g. ``int`` to ``float``). df3 df3.dtypes -The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning +:meth:`DataFrame.to_numpy` will return the *lower-common-denominator* of the dtypes, meaning the dtype that can accommodate **ALL** of the types in the resulting homogeneous dtyped NumPy array. This can force some *upcasting*. .. 
ipython:: python - df3.values.dtype + df3.to_numpy().dtype astype ~~~~~~ @@ -2211,11 +2267,11 @@ dtypes: 'float64': np.arange(4.0, 7.0), 'bool1': [True, False, True], 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, + 'dates': pd.date_range('now', periods=3), 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3).values + df['other_dates'] = pd.date_range('20130101', periods=3) df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') df diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 2f2430f02f89df..31f2430e4be885 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -178,7 +178,7 @@ are consistent among all columns. To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as categories for each column, the ``categories`` parameter can be determined programmatically by - ``categories = pd.unique(df.values.ravel())``. + ``categories = pd.unique(df.to_numpy().ravel())``. If you already have ``codes`` and ``categories``, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step @@ -955,7 +955,7 @@ Use ``.astype`` or ``union_categoricals`` to get ``category`` result. pd.concat([s1, s3]) pd.concat([s1, s3]).astype('category') - union_categoricals([s1.values, s3.values]) + union_categoricals([s1.array, s3.array]) Following table summarizes the results of ``Categoricals`` related concatenations. diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index ccd530d11b8f93..6195212873e75c 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -137,7 +137,43 @@ However, operations such as slicing will also slice the index. s[[4, 3, 1]] np.exp(s) -We will address array-based indexing in a separate :ref:`section `. +.. 
note:: + + We will address array-based indexing like ``s[[4, 3, 1]]`` + in :ref:`section <indexing>`. + +Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`. + +.. ipython:: python + + s.dtype + +This is often a NumPy dtype. However, pandas and 3rd-party libraries +extend NumPy's type system in a few places, in which case the dtype would +be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within +pandas are :ref:`categorical` and :ref:`integer_na`. See :ref:`basics.dtypes` +for more. + +If you need the actual array backing a ``Series``, use :attr:`Series.array`. + +.. ipython:: python + + s.array + +Again, this is often a NumPy array, but may instead be a +:class:`~pandas.api.extensions.ExtensionArray`. See :ref:`basics.dtypes` for more. +Accessing the array can be useful when you need to do some operation without the +index (to disable :ref:`automatic alignment <dsintro.alignment>`, for example). + +While Series is ndarray-like, if you need an *actual* ndarray, then use +:meth:`Series.to_numpy`. + +.. ipython:: python + + s.to_numpy() + +Even if the Series is backed by a :class:`~pandas.api.extensions.ExtensionArray`, +:meth:`Series.to_numpy` will return a NumPy ndarray. Series is dict-like ~~~~~~~~~~~~~~~~~~~ @@ -617,6 +653,8 @@ slicing, see the :ref:`section on indexing <indexing>`. We will address the fundamentals of reindexing / conforming to new sets of labels in the :ref:`section on reindexing <basics.reindexing>`. +.. _dsintro.alignment: + Data alignment and arithmetic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 2ca8a2b7ac0f88..1c873d604cfe0b 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -221,7 +221,7 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra You can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter to a Cython function. Instead pass the actual ``ndarray`` using the - ``.values`` attribute of the ``Series``. 
The reason is that the Cython + :meth:`Series.to_numpy`. The reason is that the Cython definition is specific to an ndarray and not the passed ``Series``. So, do not do this: @@ -230,11 +230,13 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra apply_integrate_f(df['a'], df['b'], df['N']) - But rather, use ``.values`` to get the underlying ``ndarray``: + But rather, use :meth:`Series.to_numpy` to get the underlying ``ndarray``: .. code-block:: python - apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + apply_integrate_f(df['a'].to_numpy(), + df['b'].to_numpy(), + df['N'].to_numpy()) .. note:: diff --git a/doc/source/extending.rst b/doc/source/extending.rst index 6c47d0ae8bd848..7046981a3a3643 100644 --- a/doc/source/extending.rst +++ b/doc/source/extending.rst @@ -186,7 +186,7 @@ Instead, you should detect these cases and return ``NotImplemented``. When pandas encounters an operation like ``op(Series, ExtensionArray)``, pandas will -1. unbox the array from the ``Series`` (roughly ``Series.values``) +1. unbox the array from the ``Series`` (``Series.array``) 2. call ``result = op(values, ExtensionArray)`` 3. re-box the result in a ``Series`` diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 5740ab5fa69217..dc0c6dd027b3c5 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -190,7 +190,7 @@ columns. .. ipython:: python - df.loc[:,['B', 'A']] = df[['A', 'B']].values + df.loc[:,['B', 'A']] = df[['A', 'B']].to_numpy() df[['A', 'B']] diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 48646376916071..7b6d338ee5b6a6 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -678,7 +678,7 @@ Replacing more than one value is possible by passing a list. .. 
ipython:: python - df00 = df.values[0, 0] + df00 = df.iloc[0, 0] df.replace([1.5, df00], [np.nan, 'a']) df[1].dtype diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 8650b5ed1ba375..19857db1743e80 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -27,12 +27,12 @@ Reshaping by pivoting DataFrame objects tm.N = 3 def unpivot(frame): - N, K = frame.shape - data = {'value': frame.values.ravel('F'), - 'variable': np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - columns = ['date', 'variable', 'value'] - return pd.DataFrame(data, columns=columns) + N, K = frame.shape + data = {'value': frame.to_numpy().ravel('F'), + 'variable': np.asarray(frame.columns).repeat(N), + 'date': np.tile(np.asarray(frame.index), K)} + columns = ['date', 'variable', 'value'] + return pd.DataFrame(data, columns=columns) df = unpivot(tm.makeTimeDataFrame()) @@ -54,7 +54,7 @@ For the curious here is how the above ``DataFrame`` was created: def unpivot(frame): N, K = frame.shape - data = {'value': frame.values.ravel('F'), + data = {'value': frame.to_numpy().ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K)} return pd.DataFrame(data, columns=['date', 'variable', 'value']) diff --git a/doc/source/text.rst b/doc/source/text.rst index d69888e406f0a6..d677cc38c98885 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -317,8 +317,8 @@ All one-dimensional list-likes can be combined in a list-like container (includi s u - s.str.cat([u.values, - u.index.astype(str).values], na_rep='-') + s.str.cat([u.array, + u.index.astype(str).array], na_rep='-') All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index cc377f45c4b8d0..4fa1cb8be92345 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -2436,22 
+2436,22 @@ a convert on an aware stamp. .. note:: - Using the ``.values`` accessor on a ``Series``, returns an NumPy array of the data. + Using :meth:`Series.to_numpy` on a ``Series``, returns a NumPy array of the data. These values are converted to UTC, as NumPy does not currently support timezones (even though it is *printing* in the local timezone!). .. ipython:: python - s_naive.values - s_aware.values + s_naive.to_numpy() + s_aware.to_numpy() Further note that once converted to a NumPy array these would lose the tz tenor. .. ipython:: python - pd.Series(s_aware.values) + pd.Series(s_aware.to_numpy()) However, these can be easily converted: .. ipython:: python - pd.Series(s_aware.values).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + pd.Series(s_aware.to_numpy()).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d4b2fefff322fc..d305cd7bf47e59 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -24,8 +24,53 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) -- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing -the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) + +.. _whatsnew_0240.values_api: + +Accessing the values in a Series or Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a +``Series`` or ``Index``. + +.. 
ipython:: python + + idx = pd.period_range('2000', periods=4) + idx.array + pd.Series(idx).array + +Historically, this would have been done with ``series.values``, but with +``.values`` it was unclear whether the returned value would be the actual array, +some transformation of it, or one of pandas' custom arrays (like +``Categorical``). For example, with :class:`PeriodIndex`, ``.values`` generates +a new ndarray of period objects each time. + +.. ipython:: python + + id(idx.values) + id(idx.values) + +If you need an actual NumPy array, use :meth:`Series.to_numpy` or :meth:`Index.to_numpy`. + +.. ipython:: python + + idx.to_numpy() + pd.Series(idx).to_numpy() + +For Series and Indexes backed by normal NumPy arrays, this will be the same thing (and the same +as ``.values``). + +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + ser.array + ser.to_numpy() + +We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we +recommend using ``.array`` or ``.to_numpy()`` instead. + +See :ref:`basics.dtypes` and :ref:`basics.attrs` for more. .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/core/base.py b/pandas/core/base.py index fd303182959a5d..86de25444cf4c0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -15,8 +15,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like, - is_object_dtype, is_scalar) + is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, + is_extension_type, is_list_like, is_object_dtype, is_scalar) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -777,6 +777,125 @@ def base(self): FutureWarning, stacklevel=2) return self.values.base + @property + def array(self): + # type: () -> Union[np.ndarray, ExtensionArray] + """ + The actual Array backing this Series or Index. 
+ + .. versionadded:: 0.24.0 + + Returns + ------- + array : numpy.ndarray or ExtensionArray + This is the actual array stored within this object. This differs + from ``.values`` which may require converting the data + to a different form. + + See Also + -------- + Index.to_numpy : Similar method that always returns a NumPy array. + Series.to_numpy : Similar method that always returns a NumPy array. + + Notes + ----- + This table lays out the different array types for each extension + dtype within pandas. + + ================== ============================= + dtype array type + ================== ============================= + category Categorical + period PeriodArray + interval IntervalArray + IntegerNA IntegerArray + datetime64[ns, tz] DatetimeArray + ================== ============================= + + For any 3rd-party extension types, the array type will be an + ExtensionArray. + + For all remaining dtypes ``.array`` will be the :class:`numpy.ndarray` + stored within. If you absolutely need a NumPy array (possibly with + copying / coercing data), then use :meth:`Series.to_numpy` instead. + + .. note:: + + ``.array`` will always return the underlying object backing the + Series or Index. If a future version of pandas adds a specialized + extension type for a data type, then the return type of ``.array`` + for that data type will change from an object-dtype ndarray to the + new ExtensionArray. + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.array + [a, b, a] + Categories (2, object): [a, b] + """ + return self._values + + def to_numpy(self): + """ + A NumPy ndarray representing the values in this Series or Index. + + .. versionadded:: 0.24.0 + + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. 
For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array : Get the actual data stored within. + Index.array : Get the actual data stored within. + DataFrame.to_numpy : Similar method for DataFrame. + + Notes + ----- + For NumPy dtypes, this will be a reference to the actual data stored + in this Series or Index. Modifying the result in place will modify + the data stored in the Series or Index (not that we recommend doing + that). + + For extension types, ``to_numpy()`` *may* require copying data and + coercing the result to a NumPy type (possibly object), which may be + expensive. When you need a no-copy reference to the underlying data, + :attr:`Series.array` should be used instead. + + This table lays out the different dtypes and return types of + ``to_numpy()`` for various dtypes within pandas. + + ================== ================================ + dtype array type + ================== ================================ + category[T] ndarray[T] (same dtype as input) + period ndarray[object] (Periods) + interval ndarray[object] (Intervals) + IntegerNA ndarray[object] + datetime64[ns, tz] ndarray[object] (Timestamps) + ================== ================================ + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.to_numpy() + array(['a', 'b', 'a'], dtype=object) + """ + if (is_extension_array_dtype(self.dtype) or + is_datetime64tz_dtype(self.dtype)): + # TODO(DatetimeArray): remove the second clause. 
+ return np.asarray(self._values) + return self._values + @property def _ndarray_values(self): # type: () -> np.ndarray diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ab240218ecda14..06519da9a26d56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1187,6 +1187,50 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) + def to_numpy(self): + """ + Convert the DataFrame to a NumPy array. + + .. versionadded:: 0.24.0 + + The dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, + if the dtypes are ``float16`` and ``float32``, the resulting + dtype will be ``float32``. This may require copying data and + coercing values, which may be expensive. + + Returns + ------- + array : numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + return self.values + def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3c580d29dce8d4..08c07da39128f6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5164,6 +5164,10 @@ def values(self): """ Return a Numpy representation of the DataFrame. + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. 
+ Only the values in the DataFrame will be returned, the axes labels will be removed. @@ -5225,6 +5229,7 @@ def values(self): See Also -------- + DataFrame.to_numpy : Recommended alternative to this method. pandas.DataFrame.index : Retrieve the index labels. pandas.DataFrame.columns : Retrieving the column names. """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index de59c035b81b52..e850db4178f415 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3542,6 +3542,23 @@ def _wrap_joined_index(self, joined, other): @property def values(self): """ + Return an array representing the data in the Index. + + .. warning:: + + We recommend using :attr:`Index.array` or + :meth:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + + Returns + ------- + array: numpy.ndarray or ExtensionArray + + See Also + -------- + Index.array : Reference to the underlying data. + Index.to_numpy : A NumPy array representing the underlying data. + Return the underlying data as an ndarray. """ return self._data.view(np.ndarray) @@ -3575,7 +3592,7 @@ def _values(self): values _ndarray_values """ - return self.values + return self._data def get_values(self): """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f03376c32f7f46..567834b04c1ca4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -430,6 +430,25 @@ def from_product(cls, iterables, sortorder=None, names=None): def levels(self): return self._levels + @property + def _values(self): + # We override here, since our parent uses _data, which we don't use. + return self.values + + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. + + Raises + ------ + ValueError + """ + msg = ("MultiIndex has no single backing array. 
Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples.") + raise ValueError(msg) + @property def _is_homogeneous_type(self): """Whether the levels of a MultiIndex all have the same dtype. diff --git a/pandas/core/series.py b/pandas/core/series.py index c3bcd2d76c27ab..0d6c9f4d845da2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -431,10 +431,21 @@ def values(self): """ Return Series as ndarray or ndarray-like depending on the dtype. + .. warning:: + + We recommend using :attr:`Series.array` or + :meth:`Series.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. + Returns ------- arr : numpy.ndarray or ndarray-like + See Also + -------- + Series.array : Reference to the underlying data. + Series.to_numpy : A NumPy array representing the underlying data. + Examples -------- >>> pd.Series([1, 2, 3]).values diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 295a6038509844..074745429af0de 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -319,6 +319,12 @@ def test_values(self, float_frame, float_string_frame): expected = float_frame.reindex(columns=['A', 'B']).values assert_almost_equal(arr, expected) + def test_to_numpy(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4.5]}) + expected = np.array([[1, 3], [2, 4.5]]) + result = df.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def test_transpose(self, float_frame): frame = float_frame dft = frame.T diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 854e294bd99064..a64340c02cd222 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -619,6 +619,12 @@ def test_constructor_coercion_signed_to_unsigned(self, uint_dtype): with pytest.raises(OverflowError, match=msg): Index([-1], dtype=uint_dtype) + def test_constructor_unwraps_index(self): + idx = pd.Index([1, 2]) + result = 
pd.Int64Index(idx) + expected = np.array([1, 2], dtype='int64') + tm.assert_numpy_array_equal(result._data, expected) + def test_coerce_list(self): # coerce things arr = Index([1, 2, 3, 4]) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 7bc67dd99adaec..47fafe2a900b45 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1268,3 +1268,54 @@ def test_ndarray_values(array, expected): r_values = pd.Index(array)._ndarray_values tm.assert_numpy_array_equal(l_values, r_values) tm.assert_numpy_array_equal(l_values, expected) + + +@pytest.mark.parametrize("array, attr", [ + (np.array([1, 2], dtype=np.int64), None), + (pd.Categorical(['a', 'b']), '_codes'), + (pd.core.arrays.period_array(['2000', '2001'], freq='D'), '_data'), + (pd.core.arrays.integer_array([0, np.nan]), '_data'), + (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'), + (pd.SparseArray([0, 1]), '_sparse_values'), + # TODO: DatetimeArray(add) +]) +@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +def test_array(array, attr, box): + if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + pytest.skip("No index type for {}".format(array.dtype)) + result = box(array, copy=False).array + + if attr: + array = getattr(array, attr) + result = getattr(result, attr) + + assert result is array + + +def test_array_multiindex_raises(): + idx = pd.MultiIndex.from_product([['A'], ['a', 'b']]) + with pytest.raises(ValueError, match='MultiIndex'): + idx.array + + +@pytest.mark.parametrize('array, expected', [ + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), + (pd.Categorical(['a', 'b']), np.array(['a', 'b'], dtype=object)), + (pd.core.arrays.period_array(['2000', '2001'], freq='D'), + np.array([pd.Period('2000', freq="D"), pd.Period('2001', freq='D')])), + (pd.core.arrays.integer_array([0, np.nan]), + np.array([0, np.nan], dtype=object)), + (pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), + np.array([pd.Interval(0, 1), 
pd.Interval(1, 2)], dtype=object)), + (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + # TODO: DatetimeArray(add) +]) +@pytest.mark.parametrize('box', [pd.Series, pd.Index]) +def test_to_numpy(array, expected, box): + thing = box(array) + + if array.dtype.name in ('Int64', 'Sparse[int64, 0]') and box is pd.Index: + pytest.skip("No index type for {}".format(array.dtype)) + + result = thing.to_numpy() + tm.assert_numpy_array_equal(result, expected)