From 0261c60f7a051cef20c0c940a6c65a6151c75127 Mon Sep 17 00:00:00 2001 From: Shachar Mirkin Date: Wed, 9 Dec 2020 23:09:36 +0100 Subject: [PATCH 1/4] Add multiple options --- src/xpinyin/__init__.py | 52 ++++++++++++++++++++++++++++------ src/xpinyin/combs.py | 63 +++++++++++++++++++++++++++++++++++++++++ src/xpinyin/tests.py | 9 ++++-- 3 files changed, 113 insertions(+), 11 deletions(-) create mode 100644 src/xpinyin/combs.py diff --git a/src/xpinyin/__init__.py b/src/xpinyin/__init__.py index 1f3bcbc..82db0b8 100644 --- a/src/xpinyin/__init__.py +++ b/src/xpinyin/__init__.py @@ -4,6 +4,9 @@ import os.path import re +from typing import List, Optional + +from xpinyin.combs import get_combs PinyinToneMark = { 0: u"aoeiuv\u00fc", @@ -15,7 +18,6 @@ class Pinyin(object): - """translate chinese hanzi to pinyin by python, inspired by flyerhzm’s `chinese\_pinyin`_ gem @@ -60,7 +62,7 @@ def __init__(self, data_path=data_path): with open(data_path) as f: for line in f: k, v = line.split('\t') - self.dict[k] = v + self.dict[k] = v.rstrip() @staticmethod def decode_pinyin(s): @@ -107,27 +109,59 @@ def convert_pinyin(word, convert): if convert == 'upper': return word.upper() - def get_pinyin(self, chars=u'你好', splitter=u'-', - tone_marks=None, convert='lower'): + def get_pinyins(self, chars: str, splitter: str = u'-', + tone_marks: Optional[str] = None, convert: str = 'lower', comb: bool = True) -> List[str]: + all_pinyins = [] # a list of lists of pinyin options for each char + flag = 1 # in the list (probably not aChinese character) + for char in chars: + key = "%X" % ord(char) + if key not in self.dict: + if flag == 1: + all_pinyins.append([char]) # add as is + else: + all_pinyins[-1][-1] += char # add to previous sequence of non Chinese chars + flag = 0 + else: + flag = 1 + char_py_versions = self.dict[key].split() + last = 1 if comb is False else len(char_py_versions) + if tone_marks == 'marks': + char_options = [Pinyin.decode_pinyin(o) for o in char_py_versions[0:last]] 
+ elif tone_marks == 'numbers': + char_options = [o for o in char_py_versions[0:last]] + else: + char_options = [o[:-1] for o in char_py_versions[0:last]] + all_pinyins.append([Pinyin.convert_pinyin(c, convert) for c in char_options]) + + return list(set(get_combs(all_pinyins, splitter))) # note: ignoring order + + def get_pinyin(self, chars: str, splitter: str = u'-', + tone_marks=None, convert: str = 'lower') -> str: + + return self.get_pinyins(chars, splitter=splitter, tone_marks=tone_marks, convert=convert, comb=False)[0] + + def get_pinyin_old(self, chars=u'你好', splitter=u'-', + tone_marks=None, convert='lower'): result = [] flag = 1 + for char in chars: key = "%X" % ord(char) try: if tone_marks == 'marks': - word = self.decode_pinyin(self.dict[key].split()[0].strip()) + word = self.decode_pinyin(self.dict[key].split()[0]) # TODO comb elif tone_marks == 'numbers': - word = self.dict[key].split()[0].strip() + word = self.dict[key].split()[0] # TODO comb else: - word = self.dict[key].split()[0].strip()[:-1] + word = self.dict[key].split()[0][:-1] # TODO comb word = self.convert_pinyin(word, convert) result.append(word) flag = 1 except KeyError: if flag: - result.append(char) + result.append(char) # TODO this is adding the original else: - result[-1] += char + result[-1] += char # TODO replacing the last char with the original if already was in error state flag = 0 return splitter.join(result) diff --git a/src/xpinyin/combs.py b/src/xpinyin/combs.py new file mode 100644 index 0000000..4e63653 --- /dev/null +++ b/src/xpinyin/combs.py @@ -0,0 +1,63 @@ +from typing import List + + +def _get_comb_indexes(lengths: List[int], n=None) -> List[List[int]]: + """ + Given a list with the number of possible options per place, returns a list of numbers representing combinations. + The combinations are created via additions to a multi-radix number, from left to right + (i.e. from smaller to larger numbers). + + @param n The maximal number of requested combinations. 
+ """ + # calculate the maximal number of possible combinations + n_max = 1 + for j in lengths: + n_max *= j + + n = min(n, n_max) if n is not None else n_max + if n == 0: + raise ValueError("Can't create combinations with 0-length lists") + n_items = len(lengths) + + curr = [0] * n_items + combs = [list.copy(curr)] + i = n_items - 1 + count = 1 + while count < n: + curr[i] = (curr[i] + 1) % lengths[i] + if curr[i] != 0: + combs.append(list.copy(curr)) + count += 1 + i = n_items - 1 # reset to right-most digit + else: + i -= 1 # try previous (left) digit + + return combs + + +def get_combs(options: List[List[str]], splitter='', n=10) -> List[str]: + """ + e.g.: [['a'], ['1' ,'2'], ['@']] -> [a1@, a2@] + Note: the order is not guaranteed + """ + combs = [] + comb_numbers = [len(o) for o in options] + combs_indexes = _get_comb_indexes(comb_numbers, n) + + for c in combs_indexes: # e.g. [0,2,1] + comb = [] + for i in range(len(c)): + comb.append(options[i][c[i]]) + combs.append(splitter.join(comb)) + + return combs + + +def main(): + lengths = [1, 2, 1] + print(_get_comb_indexes(lengths)) + print(get_combs([['a', 'b'], ['1', '2'], ['@', '#']])) + + +if __name__ == '__main__': + main() diff --git a/src/xpinyin/tests.py b/src/xpinyin/tests.py index 3c795c9..b09d56a 100644 --- a/src/xpinyin/tests.py +++ b/src/xpinyin/tests.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import unittest + class PinyinTests(unittest.TestCase): def Pinyin(self, *a, **kw): from xpinyin import Pinyin @@ -23,8 +24,6 @@ def test_get_pinyin_mixed_words(self): def test_get_pinyin_with_tone_marks(self): self.assertEqual(self.p.get_pinyin(u'上海', tone_marks='marks'), u'sh\xe0ng-h\u01cei') - - def test_get_pinyin_with_tone_marks(self): self.assertEqual(self.p.get_pinyin(u'秋', tone_marks='marks'), u'qiū') def test_get_initial(self): @@ -37,5 +36,11 @@ def test_get_initials_with_splitter(self): self.assertEqual(self.p.get_initials(u'你好', u' '), u'N H') self.assertEqual(self.p.get_initials(u'你好', u''), 
u'NH') + # --- testing combinations --- + + def test_get_pinyins_with_default_splitter(self): + self.assertEqual(self.p.get_pinyins(u'上海'), [u'shang-hai']) + + if __name__ == '__main__': unittest.main() From 16763a0e453c8aa8654ac0af3f46592893b6e90e Mon Sep 17 00:00:00 2001 From: Shachar Mirkin Date: Thu, 10 Dec 2020 00:38:37 +0100 Subject: [PATCH 2/4] Restore original order of combinations Restored original order of combinations (reflected frequency); added unit tests; fixed max number --- src/xpinyin/__init__.py | 48 ++++++++++++----------------------------- src/xpinyin/combs.py | 4 ++-- src/xpinyin/tests.py | 17 +++++++++++++++ 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/xpinyin/__init__.py b/src/xpinyin/__init__.py index 82db0b8..7cae020 100644 --- a/src/xpinyin/__init__.py +++ b/src/xpinyin/__init__.py @@ -110,7 +110,7 @@ def convert_pinyin(word, convert): return word.upper() def get_pinyins(self, chars: str, splitter: str = u'-', - tone_marks: Optional[str] = None, convert: str = 'lower', comb: bool = True) -> List[str]: + tone_marks: Optional[str] = None, convert: str = 'lower', n=None) -> List[str]: all_pinyins = [] # a list of lists of pinyin options for each char flag = 1 # in the list (probably not aChinese character) for char in chars: @@ -122,48 +122,28 @@ def get_pinyins(self, chars: str, splitter: str = u'-', all_pinyins[-1][-1] += char # add to previous sequence of non Chinese chars flag = 0 else: - flag = 1 - char_py_versions = self.dict[key].split() - last = 1 if comb is False else len(char_py_versions) + if tone_marks is None: # in this case we may have duplicates if the variations differ just by the tones + char_py_versions = [] + for v in self.dict[key].split(): + if v[0:-1] not in char_py_versions: # we remove the tone mark while we're at it + char_py_versions.append(v[0:-1]) + else: + char_py_versions = self.dict[key].split() + last = 1 if n == 1 else len(char_py_versions) if tone_marks == 'marks': char_options = 
[Pinyin.decode_pinyin(o) for o in char_py_versions[0:last]] - elif tone_marks == 'numbers': + else: # 'numbers' or None char_options = [o for o in char_py_versions[0:last]] - else: - char_options = [o[:-1] for o in char_py_versions[0:last]] + all_pinyins.append([Pinyin.convert_pinyin(c, convert) for c in char_options]) + flag = 1 - return list(set(get_combs(all_pinyins, splitter))) # note: ignoring order + return get_combs(all_pinyins, splitter, n=n) def get_pinyin(self, chars: str, splitter: str = u'-', tone_marks=None, convert: str = 'lower') -> str: - return self.get_pinyins(chars, splitter=splitter, tone_marks=tone_marks, convert=convert, comb=False)[0] - - def get_pinyin_old(self, chars=u'你好', splitter=u'-', - tone_marks=None, convert='lower'): - result = [] - flag = 1 - - for char in chars: - key = "%X" % ord(char) - try: - if tone_marks == 'marks': - word = self.decode_pinyin(self.dict[key].split()[0]) # TODO comb - elif tone_marks == 'numbers': - word = self.dict[key].split()[0] # TODO comb - else: - word = self.dict[key].split()[0][:-1] # TODO comb - word = self.convert_pinyin(word, convert) - result.append(word) - flag = 1 - except KeyError: - if flag: - result.append(char) # TODO this is adding the original - else: - result[-1] += char # TODO replacing the last char with the original if already was in error state - flag = 0 - return splitter.join(result) + return self.get_pinyins(chars, splitter=splitter, tone_marks=tone_marks, convert=convert, n=1)[0] def get_initial(self, char=u'你'): try: diff --git a/src/xpinyin/combs.py b/src/xpinyin/combs.py index 4e63653..406ecb7 100644 --- a/src/xpinyin/combs.py +++ b/src/xpinyin/combs.py @@ -35,10 +35,10 @@ def _get_comb_indexes(lengths: List[int], n=None) -> List[List[int]]: return combs -def get_combs(options: List[List[str]], splitter='', n=10) -> List[str]: +def get_combs(options: List[List[str]], splitter='', n=None) -> List[str]: """ + Given a list of options, returns up to n combinations e.g.: [['a'], ['1' 
,'2'], ['@']] -> [a1@, a2@] - Note: the order is not guaranteed """ combs = [] comb_numbers = [len(o) for o in options] diff --git a/src/xpinyin/tests.py b/src/xpinyin/tests.py index b09d56a..6b8cd2d 100644 --- a/src/xpinyin/tests.py +++ b/src/xpinyin/tests.py @@ -41,6 +41,23 @@ def test_get_initials_with_splitter(self): def test_get_pinyins_with_default_splitter(self): self.assertEqual(self.p.get_pinyins(u'上海'), [u'shang-hai']) + def test_get_pinyins_single_char(self): + self.assertEqual(self.p.get_pinyins(u'乐', splitter='', tone_marks='marks'), + ['lè', 'yuè', 'yào', 'luò', 'liáo']) # 4E50 LE4 YUE4 YAO4 LUO4 LIAO2 + + def test_get_pinyins_two_chars(self): + combs1 = self.p.get_pinyins(u'音', splitter='', tone_marks='marks') + combs2 = self.p.get_pinyins(u'乐', splitter='', tone_marks='marks') + combs12 = self.p.get_pinyins(u'音乐', splitter='', tone_marks='marks') + self.assertEqual(len(combs12), len(combs1) * len(combs2)) + self.assertIn('yīnyuè', combs12) + + def test_get_pinyins_no_tones_uniq(self): + self.assertEqual(['ma'], self.p.get_pinyins(u'吗', splitter='', tone_marks=None)) + + def test_get_pinyins_max_number(self): + self.assertEqual(5, len(self.p.get_pinyins(u'音乐', splitter='', n=5))) + if __name__ == '__main__': unittest.main() From f10a70091cb733267c90f3b7702fd5a4b99a09a1 Mon Sep 17 00:00:00 2001 From: Shachar Mirkin Date: Thu, 10 Dec 2020 22:55:08 +0100 Subject: [PATCH 3/4] Add tests, set combinations limit Add documentation, tests; set limit to avoid exponential explosion --- src/xpinyin/__init__.py | 38 ++++++++++++++++-------------- src/xpinyin/combs.py | 34 +++++++++++++-------------- src/xpinyin/tests.py | 52 ++++++++++++++++++++++++++++++++++------- 3 files changed, 81 insertions(+), 43 deletions(-) diff --git a/src/xpinyin/__init__.py b/src/xpinyin/__init__.py index 7cae020..961d3d6 100644 --- a/src/xpinyin/__init__.py +++ b/src/xpinyin/__init__.py @@ -19,7 +19,7 @@ class Pinyin(object): """translate chinese hanzi to pinyin by python, inspired 
by flyerhzm’s - `chinese\_pinyin`_ gem + `chinese_pinyin`_ gem usage ----- @@ -51,7 +51,7 @@ class Pinyin(object): 'S H' 请输入utf8编码汉字 - .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin + .. _chinese_pinyin: https://github.com/flyerhzm/chinese_pinyin """ data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), @@ -110,35 +110,39 @@ def convert_pinyin(word, convert): return word.upper() def get_pinyins(self, chars: str, splitter: str = u'-', - tone_marks: Optional[str] = None, convert: str = 'lower', n=None) -> List[str]: - all_pinyins = [] # a list of lists of pinyin options for each char - flag = 1 # in the list (probably not aChinese character) + tone_marks: Optional[str] = None, convert: str = 'lower', n: int = 10) -> List[str]: + """ + Get All pinyin combinations given all possible readings of each character. + The number of combinations is limited by default to 10 to avoid exponential explosion on long texts. + """ + all_pinyin_options = [] # a list of lists that we'll fill with all pinyin options for each character + flag = 1 # in the list (otherwise, probably not a Chinese character) for char in chars: key = "%X" % ord(char) if key not in self.dict: if flag == 1: - all_pinyins.append([char]) # add as is + all_pinyin_options.append([char]) # add as is else: - all_pinyins[-1][-1] += char # add to previous sequence of non Chinese chars - flag = 0 + all_pinyin_options[-1][-1] += char # add to previous sequence of non Chinese chars + flag = 0 # within a sequence of non Chinese characters else: if tone_marks is None: # in this case we may have duplicates if the variations differ just by the tones - char_py_versions = [] + char_py_options = [] for v in self.dict[key].split(): - if v[0:-1] not in char_py_versions: # we remove the tone mark while we're at it - char_py_versions.append(v[0:-1]) + if v[0:-1] not in char_py_options: # we remove the tone mark while we're at it + char_py_options.append(v[0:-1]) else: - char_py_versions = 
self.dict[key].split() - last = 1 if n == 1 else len(char_py_versions) + char_py_options = self.dict[key].split() + last = 1 if n == 1 else len(char_py_options) if tone_marks == 'marks': - char_options = [Pinyin.decode_pinyin(o) for o in char_py_versions[0:last]] + char_options = [Pinyin.decode_pinyin(o) for o in char_py_options[0:last]] else: # 'numbers' or None - char_options = [o for o in char_py_versions[0:last]] + char_options = [o for o in char_py_options[0:last]] - all_pinyins.append([Pinyin.convert_pinyin(c, convert) for c in char_options]) + all_pinyin_options.append([Pinyin.convert_pinyin(c, convert) for c in char_options]) flag = 1 - return get_combs(all_pinyins, splitter, n=n) + return get_combs(options=all_pinyin_options, splitter=splitter, n=n) def get_pinyin(self, chars: str, splitter: str = u'-', tone_marks=None, convert: str = 'lower') -> str: diff --git a/src/xpinyin/combs.py b/src/xpinyin/combs.py index 406ecb7..ab62f22 100644 --- a/src/xpinyin/combs.py +++ b/src/xpinyin/combs.py @@ -1,30 +1,33 @@ from typing import List -def _get_comb_indexes(lengths: List[int], n=None) -> List[List[int]]: +def _get_comb_indexes(num_options_list: List[int], n=None) -> List[List[int]]: """ Given a list with the number of possible options per place, returns a list of numbers representing combinations. The combinations are created via additions to a multi-radix number, from left to right (i.e. from smaller to larger numbers). - @param n The maximal number of requested combinations. + e.g. [2, 2, 1] -> [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]] + i.e. we have 2 options (0, 1) for the first and second places and one option (0) for the third. + + :param num_options_list: a list with the number of options per place + :param n: The maximal number of requested combinations. 
If None, all possible combinations will be returned """ # calculate the maximal number of possible combinations n_max = 1 - for j in lengths: + for j in num_options_list: n_max *= j - n = min(n, n_max) if n is not None else n_max if n == 0: raise ValueError("Can't create combinations with 0-length lists") - n_items = len(lengths) + n_items = len(num_options_list) curr = [0] * n_items combs = [list.copy(curr)] i = n_items - 1 count = 1 while count < n: - curr[i] = (curr[i] + 1) % lengths[i] + curr[i] = (curr[i] + 1) % num_options_list[i] if curr[i] != 0: combs.append(list.copy(curr)) count += 1 @@ -35,10 +38,15 @@ def _get_comb_indexes(lengths: List[int], n=None) -> List[List[int]]: return combs -def get_combs(options: List[List[str]], splitter='', n=None) -> List[str]: +def get_combs(options: List[List[str]], splitter: str = '', n: int = None) -> List[str]: """ - Given a list of options, returns up to n combinations + Given a list of options per place, returns up to n combinations e.g.: [['a'], ['1' ,'2'], ['@']] -> [a1@, a2@] + For instance, ['1' ,'2'] is the group defining the options for the second place + + :param options: a list with a list of options for each group. + :param splitter: a string to separate the groups + :param n: The maximal number of requested combinations. 
If None, all possible combinations will be returned """ combs = [] comb_numbers = [len(o) for o in options] @@ -51,13 +59,3 @@ def get_combs(options: List[List[str]], splitter='', n=None) -> List[str]: combs.append(splitter.join(comb)) return combs - - -def main(): - lengths = [1, 2, 1] - print(_get_comb_indexes(lengths)) - print(get_combs([['a', 'b'], ['1', '2'], ['@', '#']])) - - -if __name__ == '__main__': - main() diff --git a/src/xpinyin/tests.py b/src/xpinyin/tests.py index 6b8cd2d..5c6f68f 100644 --- a/src/xpinyin/tests.py +++ b/src/xpinyin/tests.py @@ -2,9 +2,12 @@ # -*- coding: utf-8 -*- import unittest +from xpinyin.combs import _get_comb_indexes, get_combs + class PinyinTests(unittest.TestCase): - def Pinyin(self, *a, **kw): + @staticmethod + def Pinyin(*a, **kw): from xpinyin import Pinyin return Pinyin(*a, **kw) @@ -36,14 +39,34 @@ def test_get_initials_with_splitter(self): self.assertEqual(self.p.get_initials(u'你好', u' '), u'N H') self.assertEqual(self.p.get_initials(u'你好', u''), u'NH') - # --- testing combinations --- + # --- testing combinations auxiliary functions --- + + def test_get_comb_indexes(self): + self.assertEqual([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]], _get_comb_indexes([2, 2, 1])) + + def test_get_comb_indexes_max_num(self): + self.assertEqual([[0, 0, 0], [0, 1, 0], [1, 0, 0]], _get_comb_indexes([2, 2, 1], 3)) + + def test_get_combs(self): + self.assertEqual(['a1@', 'a1#', 'a2@', 'a2#', 'b1@', 'b1#', 'b2@', 'b2#'], + get_combs([['a', 'b'], ['1', '2'], ['@', '#']])) + + def test_get_combs_splitter_max_num(self): + self.assertEqual(['a 1 @', 'a 1 #', 'a 2 @', 'a 2 #', 'b 1 @'], + get_combs([['a', 'b'], ['1', '2'], ['@', '#']], splitter=' ', n=5)) + + def test_get_combs_max_num_too_big(self): + self.assertEqual(['a||1||@', 'a||1||#', 'a||2||@', 'a||2||#', 'b||1||@', 'b||1||#', 'b||2||@', 'b||2||#'], + get_combs([['a', 'b'], ['1', '2'], ['@', '#']], splitter='||', n=100)) + + # --- testing pinyin combinations --- def 
test_get_pinyins_with_default_splitter(self): self.assertEqual(self.p.get_pinyins(u'上海'), [u'shang-hai']) def test_get_pinyins_single_char(self): - self.assertEqual(self.p.get_pinyins(u'乐', splitter='', tone_marks='marks'), - ['lè', 'yuè', 'yào', 'luò', 'liáo']) # 4E50 LE4 YUE4 YAO4 LUO4 LIAO2 + self.assertEqual(['lè', 'yuè', 'yào', 'luò', 'liáo'], # 4E50 LE4 YUE4 YAO4 LUO4 LIAO2 + self.p.get_pinyins(u'乐', splitter='', tone_marks='marks')) def test_get_pinyins_two_chars(self): combs1 = self.p.get_pinyins(u'音', splitter='', tone_marks='marks') @@ -55,9 +78,22 @@ def test_get_pinyins_two_chars(self): def test_get_pinyins_no_tones_uniq(self): self.assertEqual(['ma'], self.p.get_pinyins(u'吗', splitter='', tone_marks=None)) - def test_get_pinyins_max_number(self): + def test_get_pinyins_max_num(self): self.assertEqual(5, len(self.p.get_pinyins(u'音乐', splitter='', n=5))) - -if __name__ == '__main__': - unittest.main() + def test_get_pinyins_mixed_words(self): + self.assertEqual(self.p.get_pinyins(u'ABC串123', splitter=u' ', tone_marks='marks'), + ['ABC chuàn 123', 'ABC guàn 123']) + + def test_get_pinyins_long_seq(self): + text = u"""汉语拼音(Hànyǔ Pīnyīn), + 簡稱拼音,是一種以拉丁字母作普通话(現代標準漢語)標音的方案,為中文羅馬拼音的國際標準規範。 + 汉语拼音在中国大陆作为基础教育内容全面使用,是义务教育的重要内容。在海外,特别是常用現代標準漢語的地区如新加坡、 + 马来西亚、菲律宾和美国唐人街等,目前也在汉语教育中进行汉语拼音教学。臺灣自2008年開始, + 中文譯音使用原則也採用漢語拼音[1],但舊護照姓名和部分地名、道路名稱仍採用威妥瑪拼音、 + 郵政式拼音、國語羅馬字、國音二式抑或通用拼音[2]。""" + self.assertEqual(20, len(self.p.get_pinyins(text, n=20))) + self.assertEqual(10, len(self.p.get_pinyins(text))) # limited to 10 by default + + if __name__ == '__main__': + unittest.main() From 4e00960cec93f218e9067368c98541385ecf5f27 Mon Sep 17 00:00:00 2001 From: Shachar Mirkin Date: Fri, 11 Dec 2020 13:15:22 +0100 Subject: [PATCH 4/4] Add multiple readings example to README --- README.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 5bc7dd5..c1a6ac0 100644 --- a/README.rst +++ b/README.rst @@ -50,13 +50,17 @@ Usage 'SH' >>> 
p.get_initials(u"上海", u' ') 'S H' - + >>> # get combinations of the multiple readings of the characters + >>> p.get_pinyins(u'好吗?', splitter=u'', tone_marks='marks') + ['hǎoma?', 'hǎomá?', 'hǎomǎ?', 'hàoma?', 'hàomá?', 'hàomǎ?'] + + 如果方法中传入变量,那么直接加前缀是不可以了。而是要将变量转为utf-8编码: >>> wordvalue = '中国' >>> wordvalue= unicode(wordvalue,'utf-8') >>> s = p.get_initials(wordvalue, u'').lower() 'zg' - + 请输入utf8编码汉字