Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/PERF: optional mask in nullable dtypes #42012

Closed
wants to merge 14 commits into from
26 changes: 17 additions & 9 deletions asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

class BooleanArray:
def setup(self):
self.values_bool = np.array([True, False, True, False])
self.values_float = np.array([1.0, 0.0, 1.0, 0.0])
self.values_integer = np.array([1, 0, 1, 0])
self.values_integer_like = [1, 0, 1, 0]
self.data = np.array([True, False, True, False])
self.mask = np.array([False, False, True, False])
self.values_bool = np.array([True, False, True, False] * 1000)
self.values_float = np.array([1.0, 0.0, 1.0, 0.0] * 1000)
self.values_integer = np.array([1, 0, 1, 0] * 1000)
self.values_integer_like = [1, 0, 1, 0] * 1000
self.data = np.array([True, False, True, False] * 1000)
self.mask = np.array([False, False, True, False] * 1000)

def time_constructor(self):
pd.arrays.BooleanArray(self.data, self.mask)
Expand All @@ -30,12 +30,20 @@ def time_from_float_array(self):

class IntegerArray:
    """Benchmarks for building nullable integer arrays."""

    def setup(self):
        # Each 4-element pattern is repeated 1000 times so the timings are
        # dominated by per-element work rather than fixed call overhead.
        self.values_integer = np.array([1, 0, 1, 0] * 1000)
        self.data = np.array([1, 2, 3, 4] * 1000, dtype="int64")
        # True marks a missing entry: every third element of each pattern.
        self.mask = np.array([False, False, True, False] * 1000)

    def time_constructor(self):
        # Direct construction from a pre-built (data, mask) pair.
        pd.arrays.IntegerArray(self.data, self.mask)

    def time_from_integer_array(self):
        # Construction through pd.array from a plain integer ndarray.
        pd.array(self.values_integer, dtype="Int64")


class NullableArrayMemory:
    """Track the memory footprint of nullable arrays without missing values."""

    params = [["boolean", "Int8", "Float32"]]
    param_names = ["dtype"]
    # Label for the tracked value so reports show it as a size, not a bare number.
    unit = "bytes"

    def track_array_size(self, dtype):
        # 1000 ones contain no missing values; report the ``nbytes`` of the
        # resulting array for the requested nullable dtype.
        return pd.array(np.ones(1000), dtype=dtype).nbytes
19 changes: 16 additions & 3 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ def time_any(self, N, case, dtype):
self.s.any()


class Isna:
    """Benchmark ``Series.isna`` with and without missing values present."""

    params = ["float", "Float64", "Int64", "boolean"]
    param_names = ["dtype"]

    def setup(self, dtype):
        # ``ser`` holds no missing values; ``ser_nulls`` is a copy in which
        # every other element (even positions) is set to missing.
        self.ser = Series(np.ones(10000), dtype=dtype)
        self.ser_nulls = self.ser.copy()
        self.ser_nulls[::2] = np.nan

    def time_isna_no_nans(self, dtype):
        self.ser.isna()

    def time_isna_nans(self, dtype):
        self.ser_nulls.isna()


class NanOps:

params = [
Expand All @@ -217,9 +233,6 @@ class NanOps:
param_names = ["func", "N", "dtype"]

def setup(self, func, N, dtype):
if func == "argmax" and dtype in {"Int64", "boolean"}:
# Skip argmax for nullable int since this doesn't work yet (GH-24382)
raise NotImplementedError
self.s = Series([1] * N, dtype=dtype)
self.func = getattr(self.s, func)

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ Deprecations

Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
-
- Performance improvement and memory savings for operations with nullable data types when no missing values are present (:issue:`30435`)
-

.. ---------------------------------------------------------------------------
Expand Down
52 changes: 36 additions & 16 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
masked_reductions.py is for reduction algorithms using a mask-based approach
for missing values.
"""
from __future__ import annotations

from typing import Callable

Expand All @@ -15,7 +16,7 @@
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
mask: np.ndarray | None,
*,
skipna: bool = True,
min_count: int = 0,
Expand All @@ -30,42 +31,54 @@ def _sumprod(
Numpy array with the values (can be of any dtype that support the
operation).
mask : np.ndarray
Boolean numpy array (True values indicate missing values).
Boolean numpy array (True values indicate missing values). None is equivalent
to all False
skipna : bool, default True
Whether to skip NA.
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any() or check_below_min_count(values.shape, None, min_count):
if mask is not None or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA
return func(values, where=~mask)
if mask is not None:
return func(values, where=~mask)
else:
return func(values)


def sum(
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
values: np.ndarray,
mask: np.ndarray | None,
*,
skipna: bool = True,
min_count: int = 0,
):
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def prod(
values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0
values: np.ndarray,
mask: np.ndarray | None,
*,
skipna: bool = True,
min_count: int = 0,
):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def _minmax(
func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True
func: Callable, values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True
):
"""
Reduction for 1D masked array.
Expand All @@ -76,38 +89,45 @@ def _minmax(
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
mask : np.ndarray
Boolean numpy array (True values indicate missing values).
mask : np.ndarray or None
Boolean numpy array (True values indicate missing values). None is equivalent
to all False.
skipna : bool, default True
Whether to skip NA.
"""
if not skipna:
if mask.any() or not values.size:
if mask is not None or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
return func(values)
else:
subset = values[~mask]
if mask is not None:
subset = values[~mask]
else:
subset = values
if subset.size:
return func(subset)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA


def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
def min(values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True):
return _minmax(np.min, values=values, mask=mask, skipna=skipna)


def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True):
def max(values: np.ndarray, mask: np.ndarray | None, *, skipna: bool = True):
return _minmax(np.max, values=values, mask=mask, skipna=skipna)


def mean(values: np.ndarray, mask: np.ndarray | None, skipna: bool = True):
    """
    Mean for 1D masked array.

    Parameters
    ----------
    values : np.ndarray
        Numpy array with the values.
    mask : np.ndarray or None
        Boolean numpy array (True values indicate missing values). None is
        equivalent to all False.
    skipna : bool, default True
        Whether to skip NA; forwarded to the underlying sum.

    Returns
    -------
    The sum divided by the number of unmasked values, or ``libmissing.NA``
    when ``values`` is empty or every entry is masked.
    """
    if not values.size or (mask is not None and mask.all()):
        return libmissing.NA
    total = _sumprod(np.sum, values=values, mask=mask, skipna=skipna)
    if mask is not None:
        count = np.count_nonzero(~mask)
    else:
        # No mask means every value is valid.
        count = len(values)
    return total / count
Loading