lxneng · lxneng · Dec 11, 2020 · Dec 9, 2020 · Dec 9, 2020 · Dec 10, 2020
diff --git a/README.rst b/README.rst
@@ -50,13 +50,17 @@ Usage
     'SH'
     >>> p.get_initials(u"上海", u' ')
     'S H'
-
+    >>> # get combinations of the multiple readings of the characters
+    >>> p.get_pinyins(u'好吗？', splitter=u'', tone_marks='marks')
+    ['hǎoma？', 'hǎomá？', 'hǎomǎ？', 'hàoma？', 'hàomá？', 'hàomǎ？']
+
+
     如果方法中传入变量，那么直接加前缀是不可以了。而是要将变量转为utf-8编码：
     >>> wordvalue = '中国'
     >>> wordvalue= unicode(wordvalue,'utf-8')
     >>> s = p.get_initials(wordvalue, u'').lower()
     'zg'
-    
+
 
 请输入utf8编码汉字
 

diff --git a/src/xpinyin/__init__.py b/src/xpinyin/__init__.py
@@ -4,6 +4,9 @@
 
 import os.path
 import re
+from typing import List, Optional
+
+from xpinyin.combs import get_combs
 
 PinyinToneMark = {
     0: u"aoeiuv\u00fc",
@@ -15,9 +18,8 @@
 
 
 class Pinyin(object):
-
     """translate chinese hanzi to pinyin by python, inspired by flyerhzm’s
-    `chinese\_pinyin`_ gem
+    `chinese_pinyin`_ gem
 
     usage
     -----
@@ -49,7 +51,7 @@ class Pinyin(object):
         'S H'
 
     请输入utf8编码汉字
-    .. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin
+    .. _chinese_pinyin: https://github.com/flyerhzm/chinese_pinyin
     """
 
     data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
@@ -60,7 +62,7 @@ def __init__(self, data_path=data_path):
         with open(data_path) as f:
             for line in f:
                 k, v = line.split('\t')
-                self.dict[k] = v
+                self.dict[k] = v.rstrip()
 
     @staticmethod
     def decode_pinyin(s):
@@ -107,29 +109,45 @@ def convert_pinyin(word, convert):
         if convert == 'upper':
             return word.upper()
 
-    def get_pinyin(self, chars=u'你好', splitter=u'-',
-                   tone_marks=None, convert='lower'):
-        result = []
-        flag = 1
+    def get_pinyins(self, chars: str, splitter: str = u'-',
+                    tone_marks: Optional[str] = None, convert: str = 'lower', n: int = 10) -> List[str]:
+        """
+        Get all pinyin combinations of chars, given every listed reading of each character; runs of characters that are not in the dictionary are kept unchanged.
+        The number of combinations is limited by default to 10 (parameter n) to avoid exponential explosion on long texts.
+        """
+        all_pinyin_options = []  # a list of lists that we'll fill with all pinyin options for each character
+        flag = 1  # 1: previous char had a dictionary entry (or we are at the start); 0: inside a run of unknown chars
         for char in chars:
             key = "%X" % ord(char)
-            try:
-                if tone_marks == 'marks':
-                    word = self.decode_pinyin(self.dict[key].split()[0].strip())
-                elif tone_marks == 'numbers':
-                    word = self.dict[key].split()[0].strip()
+            if key not in self.dict:  # no reading listed: probably not a Chinese character
+                if flag == 1:
+                    all_pinyin_options.append([char])  # add as is
                 else:
-                    word = self.dict[key].split()[0].strip()[:-1]
-                word = self.convert_pinyin(word, convert)
-                result.append(word)
-                flag = 1
-            except KeyError:
-                if flag:
-                    result.append(char)
+                    all_pinyin_options[-1][-1] += char  # add to previous sequence of non Chinese chars
+                flag = 0  # within a sequence of non Chinese characters
+            else:
+                if tone_marks is None:  # in this case we may have duplicates if the variations differ just by the tones
+                    char_py_options = []
+                    for v in self.dict[key].split():
+                        if v[0:-1] not in char_py_options:  # we drop the last character (the tone number) while we're at it
+                            char_py_options.append(v[0:-1])
                 else:
-                    result[-1] += char
-                flag = 0
-        return splitter.join(result)
+                    char_py_options = self.dict[key].split()
+                last = 1 if n == 1 else len(char_py_options)  # a single requested result only ever needs the first reading
+                if tone_marks == 'marks':
+                    char_options = [Pinyin.decode_pinyin(o) for o in char_py_options[0:last]]
+                else:  # 'numbers' or None
+                    char_options = [o for o in char_py_options[0:last]]
+
+                all_pinyin_options.append([Pinyin.convert_pinyin(c, convert) for c in char_options])  # one group per character
+                flag = 1
+
+        return get_combs(options=all_pinyin_options, splitter=splitter, n=n)  # one option per group, at most n results
+
+    def get_pinyin(self, chars: str = u'你好', splitter: str = u'-',
+                   tone_marks: Optional[str] = None, convert: str = 'lower') -> str:
+        """Return the pinyin of chars built from the first listed reading of each character (chars keeps its historical default)."""
+        return self.get_pinyins(chars, splitter=splitter, tone_marks=tone_marks, convert=convert, n=1)[0]
 
     def get_initial(self, char=u'你'):
         try:

diff --git a/src/xpinyin/combs.py b/src/xpinyin/combs.py
@@ -0,0 +1,61 @@
+from typing import List
+
+
+def _get_comb_indexes(num_options_list: List[int], n=None) -> List[List[int]]:
+    """
+    Given a list with the number of possible options per place, returns a list of index combinations.
+    The combinations are enumerated like a multi-radix counter whose right-most place advances fastest
+    (i.e. from smaller to larger numbers).
+
+    e.g. [2, 2, 1] -> [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]]
+    i.e. we have 2 options (0, 1) for the first and second places and one option (0) for the third.
+
+    :param num_options_list: a list with the number of options per place
+    :param n: The maximal number of requested combinations (at least 1).
+        If None, all possible combinations will be returned
+    :return: at most n lists, each holding one option index per place
+    :raises ValueError: if some place has no options, or if n is given but smaller than 1
+    """
+    # Imported locally so the module's import block stays untouched.
+    from itertools import islice, product
+
+    # A place with no options makes every combination impossible.
+    if 0 in num_options_list:
+        raise ValueError("Can't create combinations with 0-length lists")
+
+    # Reject a non-positive limit explicitly rather than reporting it as a
+    # 0-length list (n == 0) or returning one combination anyway (n < 0).
+    if n is not None and n < 1:
+        raise ValueError("n must be at least 1 (or None for all combinations), got %r" % (n,))
+
+    # product() advances the right-most place fastest, which is the counter
+    # order documented above; islice() stops after n results (it never
+    # stops early when n is None), so only the requested combinations are
+    # ever generated, however many are possible in total.
+    index_ranges = [range(num_options) for num_options in num_options_list]
+    combs = [list(indexes) for indexes in islice(product(*index_ranges), n)]
+
+    return combs
+
+
+def get_combs(options: List[List[str]], splitter: str = '', n: int = None) -> List[str]:
+    """
+    Build up to n strings by picking one option per place and joining the picks.
+    e.g.: [['a'], ['1' ,'2'], ['@']] -> [a1@, a2@]
+    Each inner list (for instance ['1' ,'2']) holds the options of one place.
+
+    :param options: a list with a list of options for each group.
+    :param splitter: a string to separate the groups
+    :param n: The maximal number of requested combinations. If None, all possible combinations will be returned
+    :return: the joined combinations, in the order produced by _get_comb_indexes
+    """
+    # how many options each place offers, e.g. [1, 2, 1]
+    sizes = [len(group) for group in options]
+
+    joined = []
+    for picks in _get_comb_indexes(sizes, n):  # e.g. [0, 1, 0]
+        # picks[k] selects one option out of the k-th group
+        chosen = [group[pick] for group, pick in zip(options, picks)]
+        joined.append(splitter.join(chosen))
+
+    return joined
diff --git a/src/xpinyin/tests.py b/src/xpinyin/tests.py
@@ -2,8 +2,12 @@
 # -*- coding: utf-8 -*-
 import unittest
 
+from xpinyin.combs import _get_comb_indexes, get_combs
+
+
 class PinyinTests(unittest.TestCase):
-    def Pinyin(self, *a, **kw):
+    @staticmethod
+    def Pinyin(*a, **kw):
         from xpinyin import Pinyin
 
         return Pinyin(*a, **kw)
@@ -23,8 +27,6 @@ def test_get_pinyin_mixed_words(self):
 
     def test_get_pinyin_with_tone_marks(self):
         self.assertEqual(self.p.get_pinyin(u'上海', tone_marks='marks'), u'sh\xe0ng-h\u01cei')
-
-    def test_get_pinyin_with_tone_marks(self):
         self.assertEqual(self.p.get_pinyin(u'秋', tone_marks='marks'), u'qiū')
 
     def test_get_initial(self):
@@ -37,5 +39,61 @@ def test_get_initials_with_splitter(self):
         self.assertEqual(self.p.get_initials(u'你好', u' '), u'N H')
         self.assertEqual(self.p.get_initials(u'你好', u''), u'NH')
 
-if __name__ == '__main__':
-    unittest.main()
+    # --- testing combinations auxiliary functions ---
+
+    def test_get_comb_indexes(self):  # no limit: all 2 * 2 * 1 index combinations, right-most place advancing first
+        self.assertEqual([[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]], _get_comb_indexes([2, 2, 1]))
+
+    def test_get_comb_indexes_max_num(self):  # n=3 keeps only the first three of the four possible combinations
+        self.assertEqual([[0, 0, 0], [0, 1, 0], [1, 0, 0]], _get_comb_indexes([2, 2, 1], 3))
+
+    def test_get_combs(self):  # three places with two options each: 8 strings, joined with the default empty splitter
+        self.assertEqual(['a1@', 'a1#', 'a2@', 'a2#', 'b1@', 'b1#', 'b2@', 'b2#'],
+                         get_combs([['a', 'b'], ['1', '2'], ['@', '#']]))
+
+    def test_get_combs_splitter_max_num(self):  # n=5 truncates the 8 possible combinations; groups are joined with ' '
+        self.assertEqual(['a 1 @', 'a 1 #', 'a 2 @', 'a 2 #', 'b 1 @'],
+                         get_combs([['a', 'b'], ['1', '2'], ['@', '#']], splitter=' ', n=5))
+
+    def test_get_combs_max_num_too_big(self):  # n exceeds the 8 possible combinations: all of them are returned
+        self.assertEqual(['a||1||@', 'a||1||#', 'a||2||@', 'a||2||#', 'b||1||@', 'b||1||#', 'b||2||@', 'b||2||#'],
+                         get_combs([['a', 'b'], ['1', '2'], ['@', '#']], splitter='||', n=100))
+
+    # --- testing pinyin combinations ---
+
+    def test_get_pinyins_with_default_splitter(self):  # default splitter is '-'; a single combination is expected here
+        self.assertEqual(self.p.get_pinyins(u'上海'), [u'shang-hai'])
+
+    def test_get_pinyins_single_char(self):  # one result per listed reading, in the order of the dictionary entry
+        self.assertEqual(['lè', 'yuè', 'yào', 'luò', 'liáo'],  # 4E50	LE4 YUE4 YAO4 LUO4 LIAO2
+                         self.p.get_pinyins(u'乐', splitter='', tone_marks='marks'))
+
+    def test_get_pinyins_two_chars(self):  # a two-character text yields the product of the per-character readings
+        combs1 = self.p.get_pinyins(u'音', splitter='', tone_marks='marks')
+        combs2 = self.p.get_pinyins(u'乐', splitter='', tone_marks='marks')
+        combs12 = self.p.get_pinyins(u'音乐', splitter='', tone_marks='marks')
+        self.assertEqual(len(combs12), len(combs1) * len(combs2))
+        self.assertIn('yīnyuè', combs12)  # the usual reading of the word must be among the combinations
+
+    def test_get_pinyins_no_tones_uniq(self):  # readings that differ only by tone collapse into one entry without tones
+        self.assertEqual(['ma'], self.p.get_pinyins(u'吗', splitter='', tone_marks=None))
+
+    def test_get_pinyins_max_num(self):  # n caps the number of returned combinations
+        self.assertEqual(5, len(self.p.get_pinyins(u'音乐', splitter='', n=5)))
+
+    def test_get_pinyins_mixed_words(self):  # runs without a reading ('ABC', '123') stay intact as their own groups
+        self.assertEqual(self.p.get_pinyins(u'ABC串123', splitter=u' ', tone_marks='marks'),
+                         ['ABC chuàn 123', 'ABC guàn 123'])
+
+    def test_get_pinyins_long_seq(self):  # a long text has more combinations than n, so exactly n must come back
+        text = u"""汉语拼音（Hànyǔ Pīnyīn），
+            簡稱拼音，是一種以拉丁字母作普通话（現代標準漢語）標音的方案，為中文羅馬拼音的國際標準規範。
+            汉语拼音在中国大陆作为基础教育内容全面使用，是义务教育的重要内容。在海外，特别是常用現代標準漢語的地区如新加坡、
+            马来西亚、菲律宾和美国唐人街等，目前也在汉语教育中进行汉语拼音教学。臺灣自2008年開始，
+            中文譯音使用原則也採用漢語拼音[1]，但舊護照姓名和部分地名、道路名稱仍採用威妥瑪拼音、
+            郵政式拼音、國語羅馬字、國音二式抑或通用拼音[2]。"""
+        self.assertEqual(20, len(self.p.get_pinyins(text, n=20)))  # explicit limit
+        self.assertEqual(10, len(self.p.get_pinyins(text)))  # limited to 10 by default
+
+if __name__ == '__main__':
+    unittest.main()