Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remap to use correct python cjk codecs #31

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions webencodings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,18 @@
VERSION = '0.6-dev'


# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
'iso-8859-8-i': 'iso-8859-8',
# Some names in Encoding are not valid Python aliases. Remap these:
'iso-8859-8-i': 'iso8859-8',
'x-mac-cyrillic': 'mac-cyrillic',
'macintosh': 'mac-roman',
'windows-874': 'cp874'}
'windows-874': 'cp874',
# Some WHATWG-defined names conflict with a Python alias for an
# incompatible codec. These should be remapped to the correct one:
'shift_jis': 'cp932',
'big5': 'big5hkscs',
'euc-kr': 'cp949',
}

CACHE = {}

Expand Down
27 changes: 26 additions & 1 deletion webencodings/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from __future__ import unicode_literals

from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
IncrementalDecoder, IncrementalEncoder, UTF8)
IncrementalDecoder, IncrementalEncoder, UTF8, PYTHON_NAMES)


def assert_raises(exception, function, *args, **kwargs):
Expand Down Expand Up @@ -45,6 +45,21 @@ def test_labels():
assert lookup('LATİN1') is None # ASCII-only case insensitivity.


def test_remapping():
    """Check that remapped labels resolve to the expected Python codec.

    Each label is looked up and the ``name`` of the resulting encoding's
    ``codec_info`` is compared against the Python codec it should map to.
    """
    # (label passed to lookup, expected codec_info.name)
    expected_codecs = [
        ('iso-8859-8-i', 'iso8859-8'),
        ('x-mac-cyrillic', 'mac-cyrillic'),
        ('macintosh', 'mac-roman'),
        ('windows-874', 'cp874'),
        ('shift_jis', 'cp932'),
        ('big5', 'big5hkscs'),
        ('euc-kr', 'cp949'),
    ]
    for label, python_codec in expected_codecs:
        encoding = lookup(label)
        # The label must be known before its codec can be inspected.
        assert encoding is not None
        assert encoding.codec_info.name == python_codec


def test_all_labels():
for label in LABELS:
assert decode(b'', label) == ('', lookup(label))
Expand Down Expand Up @@ -95,6 +110,16 @@ def test_decode():
assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))


def test_decode_legacy_cjk():
    """Check decoding of legacy CJK byte sequences.

    For every case, ``decode`` must return the expected text together with
    the encoding object that ``lookup`` yields for the reference label.
    """
    # (input bytes, label given to decode, expected text, label for lookup)
    cases = [
        (b'\x87\x82\x87@ \xedB', "windows-31j", "№① 鍈", "shift-jis"),
        (b'\xc7g\xc6\xf1\xc6\xfd\xc7g\xc6\xf1\xc6\xfd', "big5-hkscs",
         "むかしむかし", "big5"),
        (b'\x8cc\xb9\xe6\xb0\xa2\xc7\xcf', "windows-949",
         "똠방각하", "euc-kr"),
        (b'\x92w', 'big5', '㐵', 'big5'),
    ]
    for raw, label, text, reference_label in cases:
        assert decode(raw, label) == (text, lookup(reference_label))


def test_encode():
assert encode('é', 'latin1') == b'\xe9'
assert encode('é', 'utf8') == b'\xc3\xa9'
Expand Down