From 0542c4be99d4129acc51256d23e82f963af4eca4 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Dec 2020 07:50:01 -0800 Subject: [PATCH 1/5] REF: simplify Index.__new__ --- pandas/core/indexes/base.py | 99 ++++++++++++++++++++++++---------- pandas/core/indexes/numeric.py | 15 ++++-- 2 files changed, 84 insertions(+), 30 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8d48a6277d412..369e5f87d33ca 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -66,6 +66,12 @@ validate_all_hashable, ) from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, ABCMultiIndex, @@ -332,11 +338,6 @@ def __new__( # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 - from pandas.core.indexes.numeric import ( - Float64Index, - Int64Index, - UInt64Index, - ) if dtype is not None: # we need to avoid having numpy coerce @@ -347,38 +348,27 @@ def __new__( data = _maybe_cast_with_dtype(data, dtype, copy) dtype = data.dtype # TODO: maybe not for object? - # maybe coerce to a sub-class - if is_signed_integer_dtype(data.dtype): - return Int64Index(data, copy=copy, dtype=dtype, name=name) - elif is_unsigned_integer_dtype(data.dtype): - return UInt64Index(data, copy=copy, dtype=dtype, name=name) - elif is_float_dtype(data.dtype): - return Float64Index(data, copy=copy, dtype=dtype, name=name) + if data.dtype.kind in ["i", "u", "f"]: + # maybe coerce to a sub-class + klass = cls._dtype_to_subclass(data.dtype) + arr = klass._ensure_array(data, dtype, copy) + return klass._simple_new(arr, name=name) + elif issubclass(data.dtype.type, bool) or is_bool_dtype(data): subarr = data.astype("object") else: subarr = com.asarray_tuplesafe(data, dtype=object) - # asarray_tuplesafe does not always copy underlying data, - # so need to make sure that this happens - if copy: - subarr = subarr.copy() - if dtype is None: new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) - if new_dtype is not None: - return cls( - new_data, dtype=new_dtype, copy=False, name=name, **kwargs - ) + return cls(new_data, dtype=new_dtype, copy=copy, name=name, **kwargs) + subarr = cls._ensure_array(subarr, dtype, copy) if kwargs: raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") - if subarr.ndim > 1: - # GH#13601, GH#20285, GH#27125 - raise ValueError("Index data must be 1-dimensional") return cls._simple_new(subarr, name) - elif data is None or is_scalar(data): + elif is_scalar(data): raise cls._scalar_data_error(data) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) @@ -400,6 +390,60 @@ def __new__( subarr = com.asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) + @classmethod + def _ensure_array(cls, data, dtype, copy: bool): + """ + Ensure we have a valid array to pass to _simple_new. + """ + if data.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + if copy: + # asarray_tuplesafe does not always copy underlying data, + # so need to make sure that this happens + data = data.copy() + return data + + @classmethod + def _dtype_to_subclass(cls, dtype: DtypeObj): + # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 + + if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"): + from pandas import DatetimeIndex + + return DatetimeIndex + if dtype == "m8[ns]": + from pandas import TimedeltaIndex + + return TimedeltaIndex + if isinstance(dtype, CategoricalDtype): + from pandas import CategoricalIndex + + return CategoricalIndex + if isinstance(dtype, IntervalDtype): + from pandas import IntervalIndex + + return IntervalIndex + if isinstance(dtype, PeriodDtype): + from pandas import PeriodIndex + + return PeriodIndex + + if is_float_dtype(dtype): + from pandas import Float64Index + + return Float64Index + if is_unsigned_integer_dtype(dtype): + from pandas import UInt64Index + + return UInt64Index + if is_signed_integer_dtype(dtype): + from pandas import Int64Index + + return Int64Index + + raise NotImplementedError(dtype) + """ NOTE for new Index creation: @@ -6048,6 +6092,7 @@ def _maybe_cast_data_without_dtype(subarr): TimedeltaArray, ) + assert subarr.dtype == object, subarr.dtype inferred = lib.infer_dtype(subarr, skipna=False) if inferred == "integer": @@ -6057,11 +6102,11 @@ def _maybe_cast_data_without_dtype(subarr): except ValueError: pass - return subarr, object + return subarr, np.dtype(object) elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - return subarr, np.float64 + return subarr, np.dtype(np.float64) elif inferred == "interval": try: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 91d27d9922aa5..2c2888e1c6f72 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -46,11 +46,20 @@ class NumericIndex(Index): _can_hold_strings = False def __new__(cls, data=None, dtype=None, copy=False, name=None): - cls._validate_dtype(dtype) name = maybe_extract_name(name, data, cls) - # Coerce to ndarray if not already ndarray or Index + subarr = cls._ensure_array(data, dtype, copy) + return cls._simple_new(subarr, name=name) + + @classmethod + def _ensure_array(cls, data, dtype, copy: bool): + """ + Ensure we have a valid array to pass to _simple_new. + """ + cls._validate_dtype(dtype) + if not isinstance(data, (np.ndarray, Index)): + # Coerce to ndarray if not already ndarray or Index if is_scalar(data): raise cls._scalar_data_error(data) @@ -74,7 +83,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): raise ValueError("Index data must be 1-dimensional") subarr = np.asarray(subarr) - return cls._simple_new(subarr, name=name) + return subarr @classmethod def _validate_dtype(cls, dtype: Dtype) -> None: From c897c6021fcd558948d885ff906be4130c0fd3d9 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Dec 2020 08:37:06 -0800 Subject: [PATCH 2/5] REF: Index.__new__ use helpers --- pandas/core/indexes/base.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 369e5f87d33ca..0e81bee296275 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -337,7 +337,6 @@ def __new__( # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): - # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423 if dtype is not None: # we need to avoid having numpy coerce @@ -350,29 +349,28 @@ def __new__( if data.dtype.kind in ["i", "u", "f"]: # maybe coerce to a sub-class - klass = cls._dtype_to_subclass(data.dtype) - arr = klass._ensure_array(data, dtype, copy) - return klass._simple_new(arr, name=name) - - elif issubclass(data.dtype.type, bool) or is_bool_dtype(data): - subarr = data.astype("object") + arr = data else: - subarr = com.asarray_tuplesafe(data, dtype=object) + arr = com.asarray_tuplesafe(data, dtype=object) - if dtype is None: - new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) - return cls(new_data, dtype=new_dtype, copy=copy, name=name, **kwargs) + if dtype is None: + new_data, new_dtype = _maybe_cast_data_without_dtype(arr) + return cls( + new_data, dtype=new_dtype, copy=copy, name=name, **kwargs + ) - subarr = cls._ensure_array(subarr, dtype, copy) + klass = cls._dtype_to_subclass(arr.dtype) + arr = klass._ensure_array(arr, dtype, copy) if kwargs: raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") - return cls._simple_new(subarr, name) + return klass._simple_new(arr, name) elif is_scalar(data): raise cls._scalar_data_error(data) elif hasattr(data, "__array__"): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) else: + if tupleize_cols and is_list_like(data): # GH21470: convert iterable to list before determining if empty if is_iterator(data): @@ -442,6 +440,10 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return Int64Index + if dtype == object: + # NB: assuming away MultiIndex + return Index + raise NotImplementedError(dtype) """ From 7e5b6b9385f5ce03ca1fc5904a86c34f837ed35a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 21 Dec 2020 10:33:22 -0800 Subject: [PATCH 3/5] dont return dtype from _maybe_cast_data_without_dtype --- pandas/core/indexes/base.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0e81bee296275..7320842f7e460 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -354,7 +354,8 @@ def __new__( arr = com.asarray_tuplesafe(data, dtype=object) if dtype is None: - new_data, new_dtype = _maybe_cast_data_without_dtype(arr) + new_data = _maybe_cast_data_without_dtype(arr) + new_dtype = new_data.dtype return cls( new_data, dtype=new_dtype, copy=copy, name=name, **kwargs ) @@ -6100,20 +6101,21 @@ def _maybe_cast_data_without_dtype(subarr): if inferred == "integer": try: data = _try_convert_to_int_array(subarr, False, None) - return data, data.dtype + return data except ValueError: pass - return subarr, np.dtype(object) + return subarr elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - return subarr, np.dtype(np.float64) + data = np.asarray(subarr).astype(np.float64) + return data elif inferred == "interval": try: data = IntervalArray._from_sequence(subarr, copy=False) - return data, data.dtype + return data except ValueError: # GH27172: mixed closed Intervals --> object dtype pass @@ -6124,7 +6126,7 @@ def _maybe_cast_data_without_dtype(subarr): if inferred.startswith("datetime"): try: data = DatetimeArray._from_sequence(subarr, copy=False) - return data, data.dtype + return data except (ValueError, OutOfBoundsDatetime): # GH 27011 # If we have mixed timezones, just send it @@ -6133,15 +6135,15 @@ def _maybe_cast_data_without_dtype(subarr): elif inferred.startswith("timedelta"): data = TimedeltaArray._from_sequence(subarr, copy=False) - return data, data.dtype + return data elif inferred == "period": try: data = PeriodArray._from_sequence(subarr) - return data, data.dtype + return data except IncompatibleFrequency: pass - return subarr, subarr.dtype + return subarr def _try_convert_to_int_array( From 86c28e6e616d1a9cf33a3f1a6451adc3aa315cb9 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 24 Dec 2020 10:18:10 -0800 Subject: [PATCH 4/5] if -> elif --- pandas/core/indexes/base.py | 16 ++++++++-------- pandas/core/internals/blocks.py | 7 ++----- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5066a0080181c..535f603cf694c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -411,37 +411,37 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): from pandas import DatetimeIndex return DatetimeIndex - if dtype == "m8[ns]": + elif dtype == "m8[ns]": from pandas import TimedeltaIndex return TimedeltaIndex - if isinstance(dtype, CategoricalDtype): + elif isinstance(dtype, CategoricalDtype): from pandas import CategoricalIndex return CategoricalIndex - if isinstance(dtype, IntervalDtype): + elif isinstance(dtype, IntervalDtype): from pandas import IntervalIndex return IntervalIndex - if isinstance(dtype, PeriodDtype): + elif isinstance(dtype, PeriodDtype): from pandas import PeriodIndex return PeriodIndex - if is_float_dtype(dtype): + elif is_float_dtype(dtype): from pandas import Float64Index return Float64Index - if is_unsigned_integer_dtype(dtype): + elif is_unsigned_integer_dtype(dtype): from pandas import UInt64Index return UInt64Index - if is_signed_integer_dtype(dtype): + elif is_signed_integer_dtype(dtype): from pandas import Int64Index return Int64Index - if dtype == object: + elif dtype == object: # NB: assuming away MultiIndex return Index diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a3744519e9c2b..138a19779b831 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1063,15 +1063,12 @@ def putmask(self, mask, new, axis: int = 0) -> List["Block"]: # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if self.dtype.kind in ["m", "M"]: - blk = self - if not inplace: - blk = self.copy() - arr = blk.array_values() + arr = self.array_values() arr = cast("NDArrayBackedExtensionArray", arr) if transpose: arr = arr.T arr.putmask(mask, new) - return [blk] + return [self] if lib.is_scalar(new): new = convert_scalar_for_putitemlike(new, self.values.dtype) From 8dc0330f9240b3862c4676bb506052ab62cc2531 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 29 Dec 2020 08:00:55 -0800 Subject: [PATCH 5/5] copy=False in astype --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 62ca8673fdaed..802f605e37f42 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6173,7 +6173,7 @@ def _maybe_cast_data_without_dtype(subarr): elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - data = np.asarray(subarr).astype(np.float64) + data = np.asarray(subarr).astype(np.float64, copy=False) return data elif inferred == "interval":