diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst index 87b08fae52c15..6212f2c6f3399 100644 --- a/doc/source/whatsnew/v1.3.4.rst +++ b/doc/source/whatsnew/v1.3.4.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`) - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`) - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`) +- Fixed regression in :func:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index ba1cc82bfea56..6dfddd571b88f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -877,7 +877,8 @@ def read(self, size: int = -1) -> str | bytes: if self.decode: # memory mapping is applied before compression. Encoding should # be applied to the de-compressed data.
- return content.decode(self.encoding, errors=self.errors) + final = size == -1 or len(content) < size + return self.decoder.decode(content, final=final) return content def __next__(self) -> str: diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 680c437f7087e..6ca3fdf9a6258 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -252,6 +252,26 @@ def test_encoding_memory_map(all_parsers, encoding): tm.assert_frame_equal(df, expected) +@skip_pyarrow +def test_chunk_splits_multibyte_char(all_parsers): + """ + Chunk splits a multibyte character with memory_map=True + + GH 43540 + """ + parser = all_parsers + # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx + df = DataFrame(data=["a" * 127] * 2048) + + # Put a two-byte utf-8 encoded character "ą" at the end of the chunk + # utf-8 encoding of "ą" is b'\xc4\x85' + df.iloc[2047] = "a" * 127 + "ą" + with tm.ensure_clean("bug-gh43540.csv") as fname: + df.to_csv(fname, index=False, header=False, encoding="utf-8") + dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c") + tm.assert_frame_equal(dfr, df) + + def test_not_readable(all_parsers): # GH43439 parser = all_parsers