diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a4598b315cbb7..ff017c743a00f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1705,6 +1705,7 @@ I/O ^^^ - Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) +- Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) - Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) - Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) - Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5590e8f445c67..b31d3f665f47f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1296,15 +1296,28 @@ def _validate_usecols_arg(usecols): if usecols is not None: if callable(usecols): return usecols, None - # GH20529, ensure is iterable container but not string. - elif not is_list_like(usecols): + + if not is_list_like(usecols): + # see gh-20529 + # + # Ensure it is iterable container but not string. raise ValueError(msg) - else: - usecols_dtype = lib.infer_dtype(usecols, skipna=False) - if usecols_dtype not in ('empty', 'integer', - 'string', 'unicode'): - raise ValueError(msg) - return set(usecols), usecols_dtype + + usecols_dtype = lib.infer_dtype(usecols, skipna=False) + + if usecols_dtype not in ("empty", "integer", + "string", "unicode"): + raise ValueError(msg) + + usecols = set(usecols) + + if usecols_dtype == "unicode": + # see gh-13253 + # + # Python 2.x compatibility + usecols = {col.encode("utf-8") for col in usecols} + + return usecols, usecols_dtype return usecols, None diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 068227908a285..652f78d198ee8 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -9,7 +9,7 @@ import pytest from pandas._libs.tslib import Timestamp -from pandas.compat import PY2, StringIO +from pandas.compat import StringIO from pandas import DataFrame, Index import pandas.util.testing as tm @@ -387,8 +387,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): @pytest.mark.parametrize("usecols", [ ["あああ", "いい"], - pytest.param([u"あああ", u"いい"], marks=pytest.mark.skipif( - PY2, reason="Buggy behavior: see gh-13253")) + [u"あああ", u"いい"] ]) def test_usecols_with_multi_byte_characters(all_parsers, usecols): data = """あああ,いい,ううう,ええええ