Skip to content

Commit

Permalink
Backport PR pandas-dev#60454: String dtype: coerce missing values in …
Browse files Browse the repository at this point in the history
…indexers for string dtype Index
  • Loading branch information
jorisvandenbossche authored and meeseeksmachine committed Jan 2, 2025
1 parent e53967b commit c5611e8
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 36 deletions.
10 changes: 1 addition & 9 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -536,23 +536,15 @@ cdef class StringObjectEngine(ObjectEngine):

cdef:
object na_value
bint uses_na

def __init__(self, ndarray values, na_value):
super().__init__(values)
self.na_value = na_value
self.uses_na = na_value is C_NA

cdef bint _checknull(self, object val):
if self.uses_na:
return val is C_NA
else:
return util.is_nan(val)

cdef _check_type(self, object val):
if isinstance(val, str):
return val
elif self._checknull(val):
elif checknull(val):
return self.na_value
else:
raise KeyError(val)
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import iNaT
from pandas.errors import (
InvalidIndexError,
Expand Down Expand Up @@ -517,7 +515,6 @@ def test_setitem_ambig(self, using_infer_string):
else:
assert dm[2].dtype == np.object_

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_setitem_None(self, float_frame):
# GH #766
float_frame[None] = float_frame["A"]
Expand Down
33 changes: 16 additions & 17 deletions pandas/tests/indexes/string/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ def _isnan(val):
return False


def _equivalent_na(dtype, null):
    """
    Report whether ``null`` matches the missing-value flavour of ``dtype``.

    Two cases count as equivalent:

    * ``dtype.na_value`` is ``pd.NA`` and ``null`` is ``pd.NA`` as well;
    * both ``dtype.na_value`` and ``null`` are judged NaN by this module's
      ``_isnan`` helper.

    Anything else (for example a ``pd.NA``-backed dtype paired with a NaN
    ``null``) yields ``False``.
    """
    na_value = dtype.na_value

    # Identity check first: pd.NA is a singleton.
    if na_value is pd.NA and null is pd.NA:
        return True

    # Otherwise both sides must be NaN-like; coerce so the caller always
    # receives a plain bool regardless of what ``_isnan`` hands back.
    return bool(_isnan(na_value) and _isnan(null))


class TestGetLoc:
def test_get_loc(self, any_string_dtype):
index = Index(["a", "b", "c"], dtype=any_string_dtype)
Expand Down Expand Up @@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):

def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
if any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
):
with pytest.raises(KeyError):
index.get_loc(nulls_fixture)
else:
assert index.get_loc(nulls_fixture) == 2
assert index.get_loc(nulls_fixture) == 2


class TestGetIndexer:
Expand Down Expand Up @@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
result = index.get_indexer(["a", null, "c"])
if using_infer_string:
expected = np.array([0, 2, -1], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
elif any_string_dtype == "string" and not _equivalent_na(
any_string_dtype, null
):
expected = np.array([0, -1, -1], dtype=np.intp)
else:
Expand All @@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas(
if using_infer_string:
expected_indexer = np.array([0, 2], dtype=np.intp)
expected_missing = np.array([], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
elif any_string_dtype == "string" and not _equivalent_na(
any_string_dtype, null
):
expected_indexer = np.array([0, -1], dtype=np.intp)
expected_missing = np.array([1], dtype=np.intp)
Expand All @@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas(

if using_infer_string:
expected_indexer = np.array([0, 1, 3], dtype=np.intp)
elif any_string_dtype == "string" and (
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
elif any_string_dtype == "string" and not _equivalent_na(
any_string_dtype, null
):
pass
else:
Expand Down
12 changes: 5 additions & 7 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2619,6 +2619,8 @@ def test_pivot_columns_not_given(self):
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
df.pivot() # pylint: disable=missing-kwoa

# this still fails: columns=None is passed down to unstack as level=None,
# but by that point None has already been converted to NaN
@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
Expand All @@ -2637,10 +2639,7 @@ def test_pivot_columns_is_none(self):
expected = DataFrame({1: 3}, index=Index([2], name="b"))
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_index_is_none(self):
def test_pivot_index_is_none(self, using_infer_string):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})

Expand All @@ -2651,11 +2650,10 @@ def test_pivot_index_is_none(self):

result = df.pivot(columns="b", index=None, values="c")
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
if using_infer_string:
expected.index.name = np.nan
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
)
def test_pivot_values_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
Expand Down

0 comments on commit c5611e8

Please sign in to comment.