Skip to content

Commit

Permalink
BUG: REGR: read_csv with memory_map=True raises UnicodeDecodeError: '…
Browse files Browse the repository at this point in the history
…utf-8' codec can't decode byte 0xc4 in position 262143: unexpected end of data (#43647)
  • Loading branch information
michal-gh authored Sep 21, 2021
1 parent ef97dd4 commit 5a369d6
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Fixed regressions
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
-

.. ---------------------------------------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes:
if self.decode:
# memory mapping is applied before compression. Encoding should
# be applied to the de-compressed data.
return content.decode(self.encoding, errors=self.errors)
final = size == -1 or len(content) < size
return self.decoder.decode(content, final=final)
return content

def __next__(self) -> str:
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/io/parser/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,26 @@ def test_encoding_memory_map(all_parsers, encoding):
tm.assert_frame_equal(df, expected)


@skip_pyarrow
def test_chunk_splits_multibyte_char(all_parsers):
    """
    Round-trip a CSV whose last multibyte character straddles a chunk edge.

    The file is written so that the two bytes of "ą" fall on either side of
    the parser's chunk boundary, then read back with ``memory_map=True``.
    GH 43540
    """
    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
    # 2048 rows of 127 characters each; with a one-byte line terminator
    # (assumed -- platform dependent) that is 2048 * 128 = 262144 bytes.
    n_rows = 2048
    filler = "a" * 127
    expected = DataFrame(data=[filler] * n_rows)

    # Append the two-byte UTF-8 character "ą" (b'\xc4\x85') to the final row
    # so that its first byte is the last byte of the first chunk.
    expected.iloc[n_rows - 1] = filler + "ą"

    parser = all_parsers
    with tm.ensure_clean("bug-gh43540.csv") as path:
        expected.to_csv(path, index=False, header=False, encoding="utf-8")
        result = parser.read_csv(path, header=None, memory_map=True, engine="c")
    tm.assert_frame_equal(result, expected)


def test_not_readable(all_parsers):
# GH43439
parser = all_parsers
Expand Down

0 comments on commit 5a369d6

Please sign in to comment.