Skip to content

Commit

Permalink
BUG: Coerce to numeric despite uint64 conflict (pandas-dev#17823)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and No-Stream committed Nov 28, 2017
1 parent 783dc95 commit e9015c3
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 53 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,7 @@ Conversion
- Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
- Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (`DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`).
- Bug in :meth:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`)

Indexing
^^^^^^^^
Expand Down
27 changes: 12 additions & 15 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -165,20 +165,8 @@ cdef class Seen(object):
two conflict cases was also detected. However, we are
trying to force conversion to a numeric dtype.
"""
if self.uint_ and (self.null_ or self.sint_):
if not self.coerce_numeric:
return True

if self.null_:
msg = ("uint64 array detected, and such an "
"array cannot contain NaN.")
else: # self.sint_ = 1
msg = ("uint64 and negative values detected. "
"Cannot safely return a numeric array "
"without truncating data.")

raise ValueError(msg)
return False
return (self.uint_ and (self.null_ or self.sint_)
and not self.coerce_numeric)

cdef inline saw_null(self):
"""
Expand Down Expand Up @@ -1103,10 +1091,17 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
seen.saw_int(val)

if val >= 0:
uints[i] = val
if val <= oUINT64_MAX:
uints[i] = val
else:
seen.float_ = True

if val <= oINT64_MAX:
ints[i] = val

if seen.sint_ and seen.uint_:
seen.float_ = True

elif util.is_bool_object(val):
floats[i] = uints[i] = ints[i] = bools[i] = val
seen.bool_ = True
Expand Down Expand Up @@ -1154,6 +1149,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
uints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int

seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
else:
seen.float_ = True
except (TypeError, ValueError) as e:
Expand Down
75 changes: 37 additions & 38 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
from pandas.util import testing as tm


@pytest.fixture(params=[True, False], ids=lambda val: str(val))
def coerce(request):
return request.param


def test_is_sequence():
is_seq = inference.is_sequence
assert (is_seq((1, 2)))
Expand Down Expand Up @@ -340,44 +345,38 @@ def test_convert_numeric_uint64(self):
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)

def test_convert_numeric_uint64_nan(self):
msg = 'uint64 array detected'
cases = [(np.array([2**63, np.nan], dtype=object), set()),
(np.array([str(2**63), np.nan], dtype=object), set()),
(np.array([np.nan, 2**63], dtype=object), set()),
(np.array([np.nan, str(2**63)], dtype=object), set()),
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
(np.array([str(2**63), str(2**63 + 1)],
dtype=object), set([2**63]))]

for coerce in (True, False):
for arr, na_values in cases:
if coerce:
with tm.assert_raises_regex(ValueError, msg):
lib.maybe_convert_numeric(arr, na_values,
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
arr, na_values), arr)

def test_convert_numeric_int64_uint64(self):
msg = 'uint64 and negative values detected'
cases = [np.array([2**63, -1], dtype=object),
np.array([str(2**63), -1], dtype=object),
np.array([str(2**63), str(-1)], dtype=object),
np.array([-1, 2**63], dtype=object),
np.array([-1, str(2**63)], dtype=object),
np.array([str(-1), str(2**63)], dtype=object)]

for coerce in (True, False):
for case in cases:
if coerce:
with tm.assert_raises_regex(ValueError, msg):
lib.maybe_convert_numeric(case, set(),
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
case, set()), case)
@pytest.mark.parametrize("arr", [
np.array([2**63, np.nan], dtype=object),
np.array([str(2**63), np.nan], dtype=object),
np.array([np.nan, 2**63], dtype=object),
np.array([np.nan, str(2**63)], dtype=object)])
def test_convert_numeric_uint64_nan(self, coerce, arr):
expected = arr.astype(float) if coerce else arr.copy()
result = lib.maybe_convert_numeric(arr, set(),
coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

def test_convert_numeric_uint64_nan_values(self, coerce):
arr = np.array([2**63, 2**63 + 1], dtype=object)
na_values = set([2**63])

expected = (np.array([np.nan, 2**63 + 1], dtype=float)
if coerce else arr.copy())
result = lib.maybe_convert_numeric(arr, na_values,
coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize("case", [
np.array([2**63, -1], dtype=object),
np.array([str(2**63), -1], dtype=object),
np.array([str(2**63), str(-1)], dtype=object),
np.array([-1, 2**63], dtype=object),
np.array([-1, str(2**63)], dtype=object),
np.array([str(-1), str(2**63)], dtype=object)])
def test_convert_numeric_int64_uint64(self, case, coerce):
expected = case.astype(float) if coerce else case.copy()
result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

def test_maybe_convert_objects_uint64(self):
# see gh-4471
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/tools/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,28 @@ def test_downcast_limits(self):
for dtype, downcast, min_max in dtype_downcast_min_max:
series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
assert series.dtype == dtype

def test_coerce_uint64_conflict(self):
# see gh-17007 and gh-17125
#
# Still returns float despite the uint64-nan conflict,
# which would normally force the casting to object.
df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
expected = pd.Series([200, 300, np.nan, np.nan,
30000000000000000000], dtype=float, name="a")
result = to_numeric(df["a"], errors="coerce")
tm.assert_series_equal(result, expected)

s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
expected = pd.Series([12345678901234567890,
1234567890, np.nan], dtype=float)
result = to_numeric(s, errors="coerce")
tm.assert_series_equal(result, expected)

# For completeness, check against "ignore" and "raise"
result = to_numeric(s, errors="ignore")
tm.assert_series_equal(result, s)

msg = "Unable to parse string"
with tm.assert_raises_regex(ValueError, msg):
to_numeric(s, errors="raise")

0 comments on commit e9015c3

Please sign in to comment.