From 95efbbe588d66d99bc28b04f03290630a22c1a35 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Mon, 3 Aug 2020 09:27:50 +0800 Subject: [PATCH 1/3] News --- news/8684.bugfix | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 news/8684.bugfix diff --git a/news/8684.bugfix b/news/8684.bugfix new file mode 100644 index 00000000000..18e6ed9bc80 --- /dev/null +++ b/news/8684.bugfix @@ -0,0 +1,2 @@ +Use the same encoding logic from Python 3 to handle ZIP archive entries on +Python 2, so non-ASCII paths can be resolved as expected. From d4995cb89eed0a2d348e220c6ef061b3d816e0f4 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Mon, 3 Aug 2020 06:38:36 +0800 Subject: [PATCH 2/3] Implement heuristics to get non-ASCII ZIP entries --- src/pip/_internal/operations/install/wheel.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/pip/_internal/operations/install/wheel.py b/src/pip/_internal/operations/install/wheel.py index 681fc0aa8ef..f2fde0b087d 100644 --- a/src/pip/_internal/operations/install/wheel.py +++ b/src/pip/_internal/operations/install/wheel.py @@ -78,6 +78,7 @@ Union, cast, ) + from zipfile import ZipInfo from pip._vendor.pkg_resources import Distribution @@ -420,6 +421,28 @@ def __init__(self, src_record_path, dest_path, zip_file): self._zip_file = zip_file self.changed = False + def _getinfo(self): + # type: () -> ZipInfo + if not PY2: + return self._zip_file.getinfo(self.src_record_path) + + # Python 2 does not expose a way to detect a ZIP's encoding, so we + # "guess" with the heuristics below: + # 1. Try encoding the path with UTF-8. + # 2. Check the matching info's flags for language encoding (bit 11). + # 3. If the flag is set, assume UTF-8 is correct. + # 4. If any of the above steps fails, fallback to getting an info with + # CP437 (matching Python 3). + try: + arcname = self.src_record_path.encode("utf-8") + info = self._zip_file.getinfo(arcname) + if info.flag_bits & 0x800: + return info + except (KeyError, UnicodeEncodeError): + pass + arcname = self.src_record_path.encode("cp437") + return self._zip_file.getinfo(arcname) + def save(self): # type: () -> None # directory creation is lazy and after file filtering @@ -439,11 +462,12 @@ def save(self): if os.path.exists(self.dest_path): os.unlink(self.dest_path) - with self._zip_file.open(self.src_record_path) as f: + zipinfo = self._getinfo() + + with self._zip_file.open(zipinfo) as f: with open(self.dest_path, "wb") as dest: shutil.copyfileobj(f, dest) - zipinfo = self._zip_file.getinfo(self.src_record_path) if zip_item_is_executable(zipinfo): set_extracted_file_to_default_mode_plus_executable(self.dest_path) From a12e2f147997dd0932727150ae596ca489a41e78 Mon Sep 17 00:00:00 2001 From: Tzu-ping Chung Date: Mon, 3 Aug 2020 14:59:42 +0800 Subject: [PATCH 3/3] PEP 427 mandates UTF-8, we don't need the fallback --- news/8684.bugfix | 4 ++-- src/pip/_internal/operations/install/wheel.py | 21 ++++--------------- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/news/8684.bugfix b/news/8684.bugfix index 18e6ed9bc80..528291d736a 100644 --- a/news/8684.bugfix +++ b/news/8684.bugfix @@ -1,2 +1,2 @@ -Use the same encoding logic from Python 3 to handle ZIP archive entries on -Python 2, so non-ASCII paths can be resolved as expected. +Use UTF-8 to handle ZIP archive entries on Python 2 according to PEP 427, so +non-ASCII paths can be resolved as expected. diff --git a/src/pip/_internal/operations/install/wheel.py b/src/pip/_internal/operations/install/wheel.py index f2fde0b087d..e91b1b8d558 100644 --- a/src/pip/_internal/operations/install/wheel.py +++ b/src/pip/_internal/operations/install/wheel.py @@ -425,23 +425,10 @@ def _getinfo(self): # type: () -> ZipInfo if not PY2: return self._zip_file.getinfo(self.src_record_path) - - # Python 2 does not expose a way to detect a ZIP's encoding, so we - # "guess" with the heuristics below: - # 1. Try encoding the path with UTF-8. - # 2. Check the matching info's flags for language encoding (bit 11). - # 3. If the flag is set, assume UTF-8 is correct. - # 4. If any of the above steps fails, fallback to getting an info with - # CP437 (matching Python 3). - try: - arcname = self.src_record_path.encode("utf-8") - info = self._zip_file.getinfo(arcname) - if info.flag_bits & 0x800: - return info - except (KeyError, UnicodeEncodeError): - pass - arcname = self.src_record_path.encode("cp437") - return self._zip_file.getinfo(arcname) + # Python 2 does not expose a way to detect a ZIP's encoding, but the + # wheel specification (PEP 427) explicitly mandates that paths should + # use UTF-8, so we assume it is true. + return self._zip_file.getinfo(self.src_record_path.encode("utf-8")) def save(self): # type: () -> None