Backport PR #43647 on branch 1.3.x (BUG: REGR: read_csv with memory_map=True raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 262143: unexpected end of data) (#43691)

Co-authored-by: michal-gh <michaltus@gmail.com>
Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
pandas-dev · Sep 22, 2021 · 3162b24 · 3162b24
1 parent 4782ec1
commit 3162b24
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v1.3.4.rst b/doc/source/whatsnew/v1.3.4.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
 - Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
 - Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
+- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)
 -
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -859,7 +859,8 @@ def read(self, size: int = -1) -> str | bytes:
         if self.decode:
             # memory mapping is applied before compression. Encoding should
             # be applied to the de-compressed data.
-            return content.decode(self.encoding, errors=self.errors)
+            final = size == -1 or len(content) < size
+            return self.decoder.decode(content, final=final)
         return content
 
     def __next__(self) -> str:

diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
@@ -239,6 +239,25 @@ def test_encoding_memory_map(all_parsers, encoding):
     tm.assert_frame_equal(df, expected)
 
 
+def test_chunk_splits_multibyte_char(all_parsers):
+    """
+    Chunk splits a multibyte character with memory_map=True
+
+    GH 43540
+    """
+    parser = all_parsers
+    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
+    df = DataFrame(data=["a" * 127] * 2048)
+
+    # Put two-bytes utf-8 encoded character "ą" at the end of chunk
+    # utf-8 encoding of "ą" is b'\xc4\x85'
+    df.iloc[2047] = "a" * 127 + "ą"
+    with tm.ensure_clean("bug-gh43540.csv") as fname:
+        df.to_csv(fname, index=False, header=False, encoding="utf-8")
+        dfr = parser.read_csv(fname, header=None, memory_map=True, engine="c")
+    tm.assert_frame_equal(dfr, df)
+
+
 def test_not_readable(all_parsers):
     # GH43439
     parser = all_parsers