Skip to content

Commit

Permalink
New tests
Browse files Browse the repository at this point in the history
*  the emojize() regular expression covers all characters in emoji names including different Unicode forms (NFD/NFC)
*  the regular expression doesn't contain unnecessary characters
*  the EMOJI_DATA only contains NFKC Unicode form
*  emojize() can handle NFC and NFD

(They all fail at the moment)
  • Loading branch information
cvzi committed Sep 30, 2022
1 parent c5ac514 commit cdd6c51
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 5 deletions.
82 changes: 77 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@

from __future__ import unicode_literals

import sys
import random
import re
import emoji
import pytest
import unicodedata

# True when the running interpreter's version is below 3.0 (i.e. Python 2).
_IS_PYTHON_2 = sys.version_info < (3, 0)

# Build all language packs (i.e. fill the cache):
emoji.emojize("", language="alias")
Expand All @@ -22,12 +25,81 @@ def ascii(s):
return s.encode("unicode-escape").decode()


def all_language_and_alias_packs():
    """Yield ``(label, pack)`` pairs for the alias pack and every language.

    The alias pack comes first, labelled ``'alias'``; it is followed by one
    pair per entry of ``emoji.LANGUAGES``, labelled with that language code.
    """
    codes = emoji.unicode_codes
    yield ('alias', codes.get_aliases_unicode_dict())

    for language in emoji.LANGUAGES:
        pack = codes.get_emoji_unicode_dict(language)
        yield (language, pack)


def normalize(form, s):
    """Return *s* converted to the Unicode normalization form *form*.

    *form* is passed straight to ``unicodedata.normalize``. When
    ``_IS_PYTHON_2`` is set and *s* is a ``str`` instance, it is first
    converted with ``unicode()``.
    """
    needs_conversion = _IS_PYTHON_2 and isinstance(s, str)
    text = unicode(s) if needs_conversion else s
    return unicodedata.normalize(form, text)


def test_emojize_name_only():
    """Every emoji name, in every Unicode form, must emojize to its emoji."""
    # NOTE(review): this first pass over the cached packs looks like the
    # earlier body of this test shown next to the new one -- confirm whether
    # both passes are meant to be kept.
    for lang_code, emoji_pack in emoji.unicode_codes._EMOJI_UNICODE.items():
        for name, expected in emoji_pack.items():
            actual = emoji.emojize(name, language=lang_code)
            assert expected == actual, '%s != %s' % (expected, actual)

    # Check that the regular expression emoji.core._EMOJI_NAME_PATTERN
    # contains all the necessary characters.
    from emoji.core import _EMOJI_NAME_PATTERN

    # Negated character class: matches any character the name pattern lacks.
    disallowed = re.compile(u'[^%s]' % (_EMOJI_NAME_PATTERN, ), flags=re.UNICODE)
    forms = ('NFKC', 'NFKD', 'NFD', 'NFC')

    for lang_code, emoji_pack in all_language_and_alias_packs():
        for name_in_db, expected in emoji_pack.items():
            variants = [('Form from EMOJI_DATA', name_in_db)]
            variants.extend((form, normalize(form, name_in_db)) for form in forms)

            for form, name in variants:
                actual = emoji.emojize(name, language=lang_code)
                # Drop the first and last character (presumably the ':'
                # delimiters) so that only the name itself is inspected.
                inner = name[1:-1]

                if expected != actual:
                    # Diagnostic output listing the offending characters.
                    print("Regular expression is missing a character:")
                    print("Emoji name %r in form %r contains:" % (name, form))
                    report = [
                        "%r (%r) is not in the regular expression" % (x, x.encode('unicode-escape').decode())
                        for x in disallowed.findall(inner)
                    ]
                    print("\n".join(report))

                assert expected == actual, '%s != %s' % (expected, actual)
                assert disallowed.search(inner) is None


def test_regular_expression_minimal():
    """Check that emoji.core._EMOJI_NAME_PATTERN holds only necessary characters.

    Each non-escaped character of the pattern is removed in turn and a negated
    character class is built from the remainder. If no emoji name (as stored,
    or in NFKC/NFKD/NFD/NFC form) contains a character outside that reduced
    class, the removed character was not needed and the test fails.
    """
    from emoji.core import _EMOJI_NAME_PATTERN

    pattern_str = u'[^%s]' % (_EMOJI_NAME_PATTERN, )
    forms = ('NFKC', 'NFKD', 'NFD', 'NFC')

    def any_name_matches(pattern):
        # True as soon as one name (in any form) contains a character that
        # the reduced class no longer covers; stops scanning at the first hit.
        for _lang_code, emoji_pack in all_language_and_alias_packs():
            for name_in_db in emoji_pack:
                inner = name_in_db[1:-1]
                if pattern.search(inner):
                    return True
                if any(pattern.search(normalize(form, inner)) for form in forms):
                    return True
        return False

    i = 2  # skip the leading '[^'
    while i < len(pattern_str) - 1:  # stop before the closing ']'
        c = pattern_str[i]
        if c == '\\':
            # Escape sequences are kept as a unit; skip both characters.
            i += 2
            continue

        # Remove only the character at position i. str.replace() would drop
        # every occurrence, including ones inside escape sequences or the
        # '[^' prefix, and would hide duplicated characters.
        reduced = pattern_str[:i] + pattern_str[i + 1:]
        pattern = re.compile(reduced, flags=re.UNICODE)

        assert any_name_matches(pattern), "char: %r is not necessary in regular expression" % (c, )

        i += 1


def test_emojize_complicated_string():
Expand Down
58 changes: 58 additions & 0 deletions tests/test_nfkc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# -*- coding: UTF-8 -*-


"""Unittests for emoji.core"""

import sys
import emoji
import unicodedata

def is_normalized(form, s):
    """Return True if *s* is already in the Unicode normalization form *form*.

    Uses ``unicodedata.is_normalized`` where available (Python 3.8+). On
    older interpreters the string is normalized and compared with itself;
    on Python 2 it is first converted with ``unicode()``.
    """
    if sys.version_info >= (3, 8):
        return unicodedata.is_normalized(form, s)
    if sys.version_info >= (3, 0):
        # Was `> (2, 0)`, which is also true on Python 2.x and made the
        # Python 2 fallback below unreachable.
        return unicodedata.normalize(form, s) == s
    u = unicode(s)  # noqa: F821 -- Python 2 only
    return unicodedata.normalize(form, u) == u


def test_database_normalized():
    """Assert that every alias and language name in EMOJI_DATA is in NFKC form.

    The check is skipped (returns early) on interpreters older than 3.8.
    """
    if sys.version_info < (3, 8):
        return

    for e, emoji_data in emoji.EMOJI_DATA.items():
        # Aliases are optional; treat a missing entry as "no aliases".
        aliases = emoji_data['alias'] if 'alias' in emoji_data else ()
        for alias in aliases:
            assert is_normalized('NFKC', alias), 'Alias %r of %r is not NFKC' % (alias, e)

        for lang in emoji.LANGUAGES:
            if lang not in emoji_data:
                continue
            name = emoji_data[lang]
            assert is_normalized('NFKC', name), 'Name lang=%s of %r is not NFKC' % (lang, e)


def test_normalized_and_not_normalized():
    """emojize() must give the same emoji for a name in different Unicode forms.

    Each entry is ``[language, composed name, other form of the same name]``.
    Both spellings must produce the same result, and that result must not
    start with ':' (i.e. the name was actually replaced).
    """
    pairs = [
        ['en', u':Cura\xe7ao:', u':Curac\u0327ao:'],
        # The first string used to be the decomposed form as well, so the
        # pair compared a string with itself; it is now the composed form.
        ['en', u':C\xf4te_d\u2019Ivoire:', u':Co\u0302te_d\u2019Ivoire:'],
        ['alias', u':flag_for_\xc5land_Islands:', u':flag_for_A\u030aland_Islands:'],
        ['de', u':flagge_d\xe4nemark:', u':flagge_da\u0308nemark:'],
        ['fr', u':drapeau_r\xe9publique_dominicaine:', u':drapeau_re\u0301publique_dominicaine:'],
        ['fr', u':fl\xe8che_fin:', u':fle\u0300che_fin:'],
        ['es', u':bandera_etiop\xeda:', u':bandera_etiopi\u0301a:'],
        ['pt', u':bot\xe3o_free:', u':bota\u0303o_free:'],
        ['de', u':alter_schl\xfcssel:', u':alter_schlu\u0308ssel:'],
        ['fr', u':homme_\xe2g\xe9_peau_l\xe9g\xe8rement_mate:', u':homme_a\u0302ge\u0301_peau_le\u0301ge\u0300rement_mate:'],
        ['pt', u':cora\xe7\xe3o_vermelho:', u':corac\u0327a\u0303o_vermelho:'],
        ['en', u':Cayman_Islands:', u':Cayman_\u2160slands:'],
    ]

    # This compatibility-form pair is only exercised on Python 3.
    if sys.version_info[0] > 2:
        pairs.append(['fr', u':c\u0153ur_rouge:', u':c\ua7f9ur_rouge:'])

    for language, normalized, other_form in pairs:
        emoji_from_normalized = emoji.emojize(normalized, language=language)
        emoji_from_other_form = emoji.emojize(other_form, language=language)
        assert emoji_from_normalized == emoji_from_other_form
        assert not emoji_from_normalized.startswith(":")

0 comments on commit cdd6c51

Please sign in to comment.