gsnedders · sorcio · May 3, 2022
diff --git a/webencodings/__init__.py b/webencodings/__init__.py
@@ -22,12 +22,18 @@
 VERSION = '0.6-dev'
 
 
-# Some names in Encoding are not valid Python aliases. Remap these.
 PYTHON_NAMES = {
-    'iso-8859-8-i': 'iso-8859-8',
+    # Some Encoding Standard names are not valid Python aliases. Remap these:
+    'iso-8859-8-i': 'iso8859-8',
     'x-mac-cyrillic': 'mac-cyrillic',
     'macintosh': 'mac-roman',
-    'windows-874': 'cp874'}
+    'windows-874': 'cp874',
+    # Some WHATWG-defined names are also Python aliases, but for an
+    # incompatible codec. Remap each of these to the compatible codec:
+    'shift_jis': 'cp932',
+    'big5': 'big5hkscs',
+    'euc-kr': 'cp949',
+}
 
 CACHE = {}
 

diff --git a/webencodings/tests.py b/webencodings/tests.py
@@ -14,7 +14,7 @@
 from __future__ import unicode_literals
 
 from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
-               IncrementalDecoder, IncrementalEncoder, UTF8)
+               IncrementalDecoder, IncrementalEncoder, UTF8, PYTHON_NAMES)
 
 
 def assert_raises(exception, function, *args, **kwargs):
@@ -45,6 +45,21 @@ def test_labels():
     assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
 
 
+def test_remapping():
+    def codec_name(name):
+        encoding = lookup(name)
+        assert encoding is not None
+        return encoding.codec_info.name
+
+    assert codec_name('iso-8859-8-i') == 'iso8859-8'
+    assert codec_name('x-mac-cyrillic') == 'mac-cyrillic'
+    assert codec_name('macintosh') == 'mac-roman'
+    assert codec_name('windows-874') == 'cp874'
+    assert codec_name('shift_jis') == 'cp932'
+    assert codec_name('big5') == 'big5hkscs'
+    assert codec_name('euc-kr') == 'cp949'
+
+
 def test_all_labels():
     for label in LABELS:
         assert decode(b'', label) == ('', lookup(label))
@@ -95,6 +110,16 @@ def test_decode():
     assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
 
 
+def test_decode_legacy_cjk():
+    assert decode(b'\x87\x82\x87@ \xedB', "windows-31j") == (
+        "№① 鍈", lookup("shift-jis"))
+    assert decode(b'\xc7g\xc6\xf1\xc6\xfd\xc7g\xc6\xf1\xc6\xfd', "big5-hkscs") == (
+        "むかしむかし", lookup("big5"))
+    assert decode(b'\x8cc\xb9\xe6\xb0\xa2\xc7\xcf', "windows-949") == (
+        "똠방각하", lookup("euc-kr"))
+    assert decode(b'\x92w', 'big5') == ('㐵', lookup('big5'))
+
+
 def test_encode():
     assert encode('é', 'latin1') == b'\xe9'
     assert encode('é', 'utf8') == b'\xc3\xa9'