Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport PR #48651 on branch 1.5.x (REGR: TextIOWrapper raising an error in read_csv) #48666

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.5.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
-
- Regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`)
-

.. ---------------------------------------------------------------------------
Expand Down
12 changes: 0 additions & 12 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from collections import defaultdict
import inspect
from io import TextIOWrapper
from typing import (
TYPE_CHECKING,
Hashable,
Expand Down Expand Up @@ -67,17 +66,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
# Have to pass int, would break tests using TextReader directly otherwise :(
kwds["on_bad_lines"] = self.on_bad_lines.value

# c-engine can cope with utf-8 bytes. Remove TextIOWrapper when its errors
# policy is the same as the one given to read_csv
if (
isinstance(src, TextIOWrapper)
and src.encoding == "utf-8"
and (src.errors or "strict") == kwds["encoding_errors"]
):
# error: Incompatible types in assignment (expression has type "BinaryIO",
# variable has type "ReadCsvBuffer[str]")
src = src.buffer # type: ignore[assignment]

for key in (
"storage_options",
"encoding",
Expand Down
11 changes: 11 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from pandas.io.common import (
IOHandles,
get_handle,
stringify_path,
validate_header_arg,
)
from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
Expand Down Expand Up @@ -1726,6 +1727,16 @@ def _make_engine(
if engine == "pyarrow":
is_text = False
mode = "rb"
elif (
engine == "c"
and self.options.get("encoding", "utf-8") == "utf-8"
and isinstance(stringify_path(f), str)
):
# c engine can decode utf-8 bytes, adding TextIOWrapper makes
# the c-engine especially for memory_map=True far slower
is_text = False
if "b" not in mode:
mode += "b"
self.handles = get_handle(
f,
mode,
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,3 +928,17 @@ def test_read_table_posargs_deprecation(all_parsers):
"except for the argument 'filepath_or_buffer' will be keyword-only"
)
parser.read_table_check_warnings(FutureWarning, msg, data, " ")


def test_read_seek(all_parsers):
    """Check that read_csv starts at the handle's current position (GH48646).

    A UTF-8 text file is written with a one-line prefix followed by CSV
    content. The file is opened in text mode, the prefix line is consumed
    with ``readline()``, and the partially read handle is passed to
    ``read_csv``. The result must equal parsing the CSV content alone.
    """
    # GH48646
    parser = all_parsers
    prefix = "### DATA\n"
    content = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        # Write with the same explicit encoding used to reopen the file
        # below, so the fixture does not depend on the locale default.
        Path(path).write_text(prefix + content, encoding="utf-8")
        with open(path, encoding="utf-8") as file:
            # Advance past the prefix so the handle is no longer at offset 0.
            file.readline()
            actual = parser.read_csv(file)
        expected = parser.read_csv(StringIO(content))
    tm.assert_frame_equal(actual, expected)