From c5611e87aca9a676e1010957135c3ad022a78b15 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 2 Jan 2025 20:49:06 +0100 Subject: [PATCH] Backport PR #60454: String dtype: coerce missing values in indexers for string dtype Index --- pandas/_libs/index.pyx | 10 +----- pandas/tests/frame/indexing/test_indexing.py | 3 -- pandas/tests/indexes/string/test_indexing.py | 33 ++++++++++---------- pandas/tests/reshape/test_pivot.py | 12 +++---- 4 files changed, 22 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 365cc7c3cecfc..8bb839dee436d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -536,23 +536,15 @@ cdef class StringObjectEngine(ObjectEngine): cdef: object na_value - bint uses_na def __init__(self, ndarray values, na_value): super().__init__(values) self.na_value = na_value - self.uses_na = na_value is C_NA - - cdef bint _checknull(self, object val): - if self.uses_na: - return val is C_NA - else: - return util.is_nan(val) cdef _check_type(self, object val): if isinstance(val, str): return val - elif self._checknull(val): + elif checknull(val): return self.na_value else: raise KeyError(val) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index c0ab51a484cdf..aa81257965696 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import iNaT from pandas.errors import ( InvalidIndexError, @@ -517,7 +515,6 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index d1a278af337b7..648ee47ddc34c 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -13,6 +13,15 @@ def _isnan(val): return False +def _equivalent_na(dtype, null): + if dtype.na_value is pd.NA and null is pd.NA: + return True + elif _isnan(dtype.na_value) and _isnan(null): + return True + else: + return False + + class TestGetLoc: def test_get_loc(self, any_string_dtype): index = Index(["a", "b", "c"], dtype=any_string_dtype) @@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): def test_get_loc_missing(self, any_string_dtype, nulls_fixture): index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) - if any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) - ): - with pytest.raises(KeyError): - index.get_loc(nulls_fixture) - else: - assert index.get_loc(nulls_fixture) == 2 + assert index.get_loc(nulls_fixture) == 2 class TestGetIndexer: @@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): result = index.get_indexer(["a", null, "c"]) if using_infer_string: expected = np.array([0, 2, -1], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected = np.array([0, -1, -1], dtype=np.intp) else: @@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): expected_indexer = np.array([0, -1], dtype=np.intp) expected_missing = np.array([1], dtype=np.intp) @@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas( if using_infer_string: expected_indexer = np.array([0, 1, 3], dtype=np.intp) - elif any_string_dtype == "string" and ( - (any_string_dtype.na_value is pd.NA and null is not pd.NA) - or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + elif any_string_dtype == "string" and not _equivalent_na( + any_string_dtype, null ): pass else: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 75268ccee1d8c..519564a96aa7e 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2619,6 +2619,8 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() # pylint: disable=missing-kwoa + # this still fails because columns=None gets passed down to unstack as level=None + # while at that point None was converted to NaN @pytest.mark.xfail( using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" ) @@ -2637,10 +2639,7 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) - def test_pivot_index_is_none(self): + def test_pivot_index_is_none(self, using_infer_string): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2651,11 +2650,10 @@ def test_pivot_index_is_none(self): result = df.pivot(columns="b", index=None, values="c") expected = DataFrame(3, index=[1], columns=Index([2], name="b")) + if using_infer_string: + expected.index.name = np.nan tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" - ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3})