From bc6752a5f49008c6113afeb78358a3b268c6c70a Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Fri, 2 Jun 2017 23:33:47 -0400 Subject: [PATCH 01/14] consolidated the duplicate definitions of NA values (in parsers & IO) --- pandas/tests/io/parser/na_values.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 170f9d428c9cc..fb277f00d8651 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -69,10 +69,7 @@ def test_non_string_na_values(self): tm.assert_frame_equal(out, expected) def test_default_na_values(self): - _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', - 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) - assert _NA_VALUES == parsers._NA_VALUES + _NA_VALUES = parsers._NA_VALUES nv = len(_NA_VALUES) def f(i, v): From 0650da0d3e014ccf6a4445f6b06bba1e49fe7457 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Sat, 3 Jun 2017 05:29:59 -0400 Subject: [PATCH 02/14] consolidated the duplicate definitions of NA values in parsers.pyx --- pandas/_libs/parsers.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2549c8545908d..f5c06a0becb86 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,6 +50,8 @@ from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals from pandas import Index +import pandas.io.parsers as parsers + import time import os @@ -276,10 +278,10 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN', - b'#N/A N/A', b'n/a', b'NA', b'#NA', b'NULL', b'null', b'NaN', - b'nan', b''] - +def c_type_conv(st): + cdef bytes py_bytes = st.encode() + return py_bytes 
+_NA_VALUES = [c_type_conv(x) for x in parsers._NA_VALUES] cdef class TextReader: """ From b0c1d4d902b072efedc9d54f0f0aee8706cd4d3e Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Sat, 3 Jun 2017 06:18:21 -0400 Subject: [PATCH 03/14] added blank lines - E302 expected 2 blank lines, found 1 in parsers.pyx --- pandas/_libs/parsers.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f5c06a0becb86..c79d4da11ff8f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -281,8 +281,10 @@ DEFAULT_CHUNKSIZE = 256 * 1024 def c_type_conv(st): cdef bytes py_bytes = st.encode() return py_bytes + _NA_VALUES = [c_type_conv(x) for x in parsers._NA_VALUES] + cdef class TextReader: """ From 1f0a350dcc6df99f9d8b201f7102b08a04503ff8 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Sat, 3 Jun 2017 07:04:48 -0400 Subject: [PATCH 04/14] added pesky blank lines - E302 expected 2 blank lines, found 1 in parsers.pyx --- pandas/_libs/parsers.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c79d4da11ff8f..5dc8d4930c8e9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -275,13 +275,14 @@ cdef extern from "parser/io.h": DEFAULT_CHUNKSIZE = 256 * 1024 -# common NA values -# no longer excluding inf representations -# '1.#INF','-1.#INF', '1.#INF000000', + def c_type_conv(st): cdef bytes py_bytes = st.encode() return py_bytes +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = [c_type_conv(x) for x in parsers._NA_VALUES] From 0579460911f63d7a93d3f12c5360f1184eba3e80 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Sat, 3 Jun 2017 07:22:33 -0400 Subject: [PATCH 05/14] added pesky blank lines - E302 expected 2 blank lines, found 1 in parsers.pyx --- pandas/_libs/parsers.pyx | 1 + 1 file changed, 1 insertion(+) diff --git 
a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5dc8d4930c8e9..5e4b262b0d013 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -280,6 +280,7 @@ def c_type_conv(st): cdef bytes py_bytes = st.encode() return py_bytes + # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', From 6225c7c46d7627e54b1f460a7bda8bc0531b9d88 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Sun, 4 Jun 2017 23:29:23 -0400 Subject: [PATCH 06/14] changed to using existing function _ensure_encoded in parsers.pyx --- pandas/_libs/parsers.pyx | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5e4b262b0d013..fe1656b78eaf6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -275,16 +275,10 @@ cdef extern from "parser/io.h": DEFAULT_CHUNKSIZE = 256 * 1024 - -def c_type_conv(st): - cdef bytes py_bytes = st.encode() - return py_bytes - - # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = [c_type_conv(x) for x in parsers._NA_VALUES] +_NA_VALUES =_ensure_encoded(in parsers._NA_VALUES) cdef class TextReader: From a7e7f01a3433162c9880d61cafbd58809ce90d1b Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 00:23:17 -0400 Subject: [PATCH 07/14] reverted na_values.py back to using assert --- pandas/tests/io/parser/na_values.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index fb277f00d8651..170f9d428c9cc 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -69,7 +69,10 @@ def test_non_string_na_values(self): tm.assert_frame_equal(out, expected) def test_default_na_values(self): - _NA_VALUES = parsers._NA_VALUES + _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A', 'N/A', 'n/a', 'NA', '#NA', 
'NULL', 'null', + 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) + assert _NA_VALUES == parsers._NA_VALUES nv = len(_NA_VALUES) def f(i, v): From dc9a36804aab3d8cc1f4f45943174bed8985dad5 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 00:44:53 -0400 Subject: [PATCH 08/14] fixing typo --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fe1656b78eaf6..4830a138812cd 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -278,7 +278,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES =_ensure_encoded(in parsers._NA_VALUES) +_NA_VALUES =_ensure_encoded(parsers._NA_VALUES) cdef class TextReader: From e8f6e821d91f069a39d38a8e997198bb7670c50a Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 01:24:37 -0400 Subject: [PATCH 09/14] convert set to list --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4830a138812cd..3bfca2a742770 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -278,7 +278,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES =_ensure_encoded(parsers._NA_VALUES) +_NA_VALUES = _ensure_encoded(list(parsers._NA_VALUES)) cdef class TextReader: From 5770506c4ed92edf7d2b8f9f89646d3cbf9e5857 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 02:51:32 -0400 Subject: [PATCH 10/14] put definition before reference --- pandas/_libs/parsers.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3bfca2a742770..99e6058db0ff8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -275,11 +275,6 @@ 
cdef extern from "parser/io.h": DEFAULT_CHUNKSIZE = 256 * 1024 -# common NA values -# no longer excluding inf representations -# '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = _ensure_encoded(list(parsers._NA_VALUES)) - cdef class TextReader: """ @@ -1380,6 +1375,12 @@ cdef asbytes(object o): return str(o) +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = _ensure_encoded(list(parsers._NA_VALUES)) + + def _is_file_like(obj): if PY3: import io From 5fa829eb3e4eeaa76909e936038e8bdf9f7cd79c Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 12:52:48 -0400 Subject: [PATCH 11/14] import _NA_VALUES from common instead of parsers, make import explicit --- pandas/_libs/parsers.pyx | 4 ++-- pandas/tests/io/parser/na_values.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 99e6058db0ff8..4d8f223be0f9d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,7 +50,7 @@ from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals from pandas import Index -import pandas.io.parsers as parsers +import pandas.io.common as common import time import os @@ -1378,7 +1378,7 @@ cdef asbytes(object o): # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = _ensure_encoded(list(parsers._NA_VALUES)) +_NA_VALUES = _ensure_encoded(list(common._NA_VALUES)) def _is_file_like(obj): diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 170f9d428c9cc..b90254d11db5d 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -8,7 +8,7 @@ import numpy as np from numpy import nan -import pandas.io.parsers as parsers +import pandas.io.common as common import pandas.util.testing as tm from pandas import DataFrame, Index, MultiIndex @@ -72,7 +72,7 @@ 
def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) - assert _NA_VALUES == parsers._NA_VALUES + assert _NA_VALUES == common._NA_VALUES nv = len(_NA_VALUES) def f(i, v): From c49944814fcd429ce8bbf9f69bb35ef0d6e677c8 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Mon, 5 Jun 2017 14:53:25 -0400 Subject: [PATCH 12/14] in import replace common with com --- pandas/_libs/parsers.pyx | 4 ++-- pandas/tests/io/parser/na_values.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4d8f223be0f9d..7375a2197c6b7 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,7 +50,7 @@ from pandas.core.algorithms import take_1d from pandas.core.dtypes.concat import union_categoricals from pandas import Index -import pandas.io.common as common +import pandas.io.common as com import time import os @@ -1378,7 +1378,7 @@ cdef asbytes(object o): # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = _ensure_encoded(list(common._NA_VALUES)) +_NA_VALUES = _ensure_encoded(list(com._NA_VALUES)) def _is_file_like(obj): diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index b90254d11db5d..c6d1cc79b82d7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -8,7 +8,7 @@ import numpy as np from numpy import nan -import pandas.io.common as common +import pandas.io.common as com import pandas.util.testing as tm from pandas import DataFrame, Index, MultiIndex @@ -72,7 +72,7 @@ def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) - assert _NA_VALUES == common._NA_VALUES + assert _NA_VALUES == 
com._NA_VALUES nv = len(_NA_VALUES) def f(i, v): From 03a335b682bb1f4499b583bfc831e33b82b85dae Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Wed, 7 Jun 2017 18:16:29 -0400 Subject: [PATCH 13/14] remove duplicate na values in io.rst --- doc/source/io.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7ea476514e88d..ef85ec0119919 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -225,9 +225,8 @@ NA and Missing Data Handling na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column - NA values. By default the following values are interpreted as NaN: - ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA', - '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``. + NA values. By default the following values are interpreted as NaN: See :ref:`na values const + <io.navaluesconst>` below. keep_default_na : boolean, default ``True`` If na_values are specified and keep_default_na is ``False`` the default NaN values are overridden, otherwise they're appended to. @@ -1030,8 +1029,11 @@ the corresponding equivalent values will also imply a missing value (in this cas ``[5.0,5]`` are recognized as ``NaN``. To completely override the default values that are recognized as missing, specify ``keep_default_na=False``. -The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A','N/A', 'NA', -'#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan']``. Although a 0-length string + +.. _io.navaluesconst: + +The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', +'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Although a 0-length string ``''`` is not included in the default ``NaN`` values list, it is still treated as a missing value. 
From 10cba2cf73fd6212ea5ad9cb10c01e8db3eceed2 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Fri, 9 Jun 2017 16:19:14 -0400 Subject: [PATCH 14/14] improved text related to na values in io.rst file --- doc/source/io.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ef85ec0119919..69377fad15270 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -225,8 +225,9 @@ NA and Missing Data Handling na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column - NA values. By default the following values are interpreted as NaN: See :ref:`na values const - <io.navaluesconst>` below. + NA values. See :ref:`na values const <io.navaluesconst>` below + for a list of the values interpreted as NaN by default. + keep_default_na : boolean, default ``True`` If na_values are specified and keep_default_na is ``False`` the default NaN values are overridden, otherwise they're appended to. @@ -1033,9 +1034,7 @@ To completely override the default values that are recognized as missing, specif .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Although a 0-length string -``''`` is not included in the default ``NaN`` values list, it is still treated -as a missing value. +'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. .. code-block:: python