diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 275c977e9b37b..802f605e37f42 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -66,6 +66,12 @@
     validate_all_hashable,
 )
 from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDatetimeIndex,
     ABCMultiIndex,
@@ -331,12 +337,6 @@ def __new__(
 
         # index-like
         elif isinstance(data, (np.ndarray, Index, ABCSeries)):
-            # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
-            from pandas.core.indexes.numeric import (
-                Float64Index,
-                Int64Index,
-                UInt64Index,
-            )
 
             if dtype is not None:
                 # we need to avoid having numpy coerce
@@ -347,42 +347,31 @@ def __new__(
                 data = _maybe_cast_with_dtype(data, dtype, copy)
                 dtype = data.dtype  # TODO: maybe not for object?
 
-            # maybe coerce to a sub-class
-            if is_signed_integer_dtype(data.dtype):
-                return Int64Index(data, copy=copy, dtype=dtype, name=name)
-            elif is_unsigned_integer_dtype(data.dtype):
-                return UInt64Index(data, copy=copy, dtype=dtype, name=name)
-            elif is_float_dtype(data.dtype):
-                return Float64Index(data, copy=copy, dtype=dtype, name=name)
-            elif issubclass(data.dtype.type, bool) or is_bool_dtype(data):
-                subarr = data.astype("object")
+            if data.dtype.kind in ["i", "u", "f"]:
+                # maybe coerce to a sub-class
+                arr = data
             else:
-                subarr = com.asarray_tuplesafe(data, dtype=object)
-
-            # asarray_tuplesafe does not always copy underlying data,
-            # so need to make sure that this happens
-            if copy:
-                subarr = subarr.copy()
+                arr = com.asarray_tuplesafe(data, dtype=object)
 
-            if dtype is None:
-                new_data, new_dtype = _maybe_cast_data_without_dtype(subarr)
-                if new_dtype is not None:
+                if dtype is None:
+                    new_data = _maybe_cast_data_without_dtype(arr)
+                    new_dtype = new_data.dtype
                     return cls(
-                        new_data, dtype=new_dtype, copy=False, name=name, **kwargs
+                        new_data, dtype=new_dtype, copy=copy, name=name, **kwargs
                     )
 
+            klass = cls._dtype_to_subclass(arr.dtype)
+            arr = klass._ensure_array(arr, dtype, copy)
             if kwargs:
                 raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}")
-            if subarr.ndim > 1:
-                # GH#13601, GH#20285, GH#27125
-                raise ValueError("Index data must be 1-dimensional")
-            return cls._simple_new(subarr, name)
+            return klass._simple_new(arr, name)
 
-        elif data is None or is_scalar(data):
+        elif is_scalar(data):
            raise cls._scalar_data_error(data)
         elif hasattr(data, "__array__"):
             return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
         else:
+
             if tupleize_cols and is_list_like(data):
                 # GH21470: convert iterable to list before determining if empty
                 if is_iterator(data):
@@ -400,6 +389,64 @@ def __new__(
             subarr = com.asarray_tuplesafe(data, dtype=object)
             return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
 
+    @classmethod
+    def _ensure_array(cls, data, dtype, copy: bool):
+        """
+        Ensure we have a valid array to pass to _simple_new.
+        """
+        if data.ndim > 1:
+            # GH#13601, GH#20285, GH#27125
+            raise ValueError("Index data must be 1-dimensional")
+        if copy:
+            # asarray_tuplesafe does not always copy underlying data,
+            # so need to make sure that this happens
+            data = data.copy()
+        return data
+
+    @classmethod
+    def _dtype_to_subclass(cls, dtype: DtypeObj):
+        # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
+
+        if isinstance(dtype, DatetimeTZDtype) or dtype == np.dtype("M8[ns]"):
+            from pandas import DatetimeIndex
+
+            return DatetimeIndex
+        elif dtype == "m8[ns]":
+            from pandas import TimedeltaIndex
+
+            return TimedeltaIndex
+        elif isinstance(dtype, CategoricalDtype):
+            from pandas import CategoricalIndex
+
+            return CategoricalIndex
+        elif isinstance(dtype, IntervalDtype):
+            from pandas import IntervalIndex
+
+            return IntervalIndex
+        elif isinstance(dtype, PeriodDtype):
+            from pandas import PeriodIndex
+
+            return PeriodIndex
+
+        elif is_float_dtype(dtype):
+            from pandas import Float64Index
+
+            return Float64Index
+        elif is_unsigned_integer_dtype(dtype):
+            from pandas import UInt64Index
+
+            return UInt64Index
+        elif is_signed_integer_dtype(dtype):
+            from pandas import Int64Index
+
+            return Int64Index
+
+        elif dtype == object:
+            # NB: assuming away MultiIndex
+            return Index
+
+        raise NotImplementedError(dtype)
+
     """
     NOTE for new Index creation:
 
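The two classmethods added above replace the import-and-branch logic that used to live inline in `Index.__new__`: `_dtype_to_subclass` maps an array's dtype to the Index subclass that should hold it, and that subclass's `_ensure_array` validates (and, if requested, copies) the data before `_simple_new`. A minimal sketch of the dtype-to-subclass mapping as it is observable from the public constructor; the concrete return types named in the comments assume the pandas 1.x line this patch targets (`Int64Index`/`Float64Index` were removed in pandas 2.0), and not every input below necessarily reaches this exact branch of `__new__`.

    import numpy as np
    import pandas as pd

    # Numeric ndarrays land on the matching NumericIndex subclass.
    int_idx = pd.Index(np.array([1, 2, 3]))       # Int64Index on pandas 1.x
    flt_idx = pd.Index(np.array([1.5, 2.5]))      # Float64Index on pandas 1.x

    # datetime64 data maps to DatetimeIndex, mirroring the first branch
    # of _dtype_to_subclass.
    dt_idx = pd.Index(np.array(["2020-01-01", "2020-01-02"], dtype="M8[ns]"))

    # Object dtype falls through to the plain Index branch ("assuming away MultiIndex").
    obj_idx = pd.Index(np.array(["a", "b"], dtype=object))

    for idx in (int_idx, flt_idx, dt_idx, obj_idx):
        print(type(idx).__name__, idx.dtype)
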
@@ -6112,25 +6159,27 @@ def _maybe_cast_data_without_dtype(subarr):
         TimedeltaArray,
     )
 
+    assert subarr.dtype == object, subarr.dtype
     inferred = lib.infer_dtype(subarr, skipna=False)
 
     if inferred == "integer":
         try:
             data = _try_convert_to_int_array(subarr, False, None)
-            return data, data.dtype
+            return data
         except ValueError:
             pass
 
-        return subarr, object
+        return subarr
 
     elif inferred in ["floating", "mixed-integer-float", "integer-na"]:
         # TODO: Returns IntegerArray for integer-na case in the future
-        return subarr, np.float64
+        data = np.asarray(subarr).astype(np.float64, copy=False)
+        return data
 
     elif inferred == "interval":
         try:
             data = IntervalArray._from_sequence(subarr, copy=False)
-            return data, data.dtype
+            return data
         except ValueError:
             # GH27172: mixed closed Intervals --> object dtype
             pass
@@ -6141,7 +6190,7 @@
         if inferred.startswith("datetime"):
             try:
                 data = DatetimeArray._from_sequence(subarr, copy=False)
-                return data, data.dtype
+                return data
             except (ValueError, OutOfBoundsDatetime):
                 # GH 27011
                 # If we have mixed timezones, just send it
@@ -6150,15 +6199,15 @@
 
         elif inferred.startswith("timedelta"):
             data = TimedeltaArray._from_sequence(subarr, copy=False)
-            return data, data.dtype
+            return data
         elif inferred == "period":
             try:
                 data = PeriodArray._from_sequence(subarr)
-                return data, data.dtype
+                return data
             except IncompatibleFrequency:
                 pass
 
-    return subarr, subarr.dtype
+    return subarr
 
 
 def _try_convert_to_int_array(
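With the hunks above, `_maybe_cast_data_without_dtype` now asserts that it receives an object-dtype array and returns only the (possibly cast) array; the caller reads the dtype off the result instead of unpacking a `(data, dtype)` pair, and the float path materializes a float64 ndarray rather than returning the `np.float64` sentinel. A short sketch of the object-dtype inference this helper backs, written against the public API only; the dtypes in the comments are what pandas infers for these inputs.

    import numpy as np
    import pandas as pd

    # Object-dtype inputs with inferable contents are upcast during Index
    # construction; internally the caller now takes new_dtype from new_data.dtype.
    ints = pd.Index(np.array([1, 2, 3], dtype=object))             # int64
    floats = pd.Index(np.array([1.5, np.nan, 2.5], dtype=object))  # float64
    stamps = pd.Index(
        np.array([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")], dtype=object)
    )                                                              # datetime64[ns]
    deltas = pd.Index(
        np.array([pd.Timedelta("1D"), pd.Timedelta("2D")], dtype=object)
    )                                                              # timedelta64[ns]

    print(ints.dtype, floats.dtype, stamps.dtype, deltas.dtype)
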
+ """ + cls._validate_dtype(dtype) + if not isinstance(data, (np.ndarray, Index)): + # Coerce to ndarray if not already ndarray or Index if is_scalar(data): raise cls._scalar_data_error(data) @@ -74,7 +83,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): raise ValueError("Index data must be 1-dimensional") subarr = np.asarray(subarr) - return cls._simple_new(subarr, name=name) + return subarr @classmethod def _validate_dtype(cls, dtype: Dtype) -> None: