Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pinyin combinations #46

Merged
merged 4 commits into from
Dec 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ Usage
'SH'
>>> p.get_initials(u"上海", u' ')
'S H'

>>> # get combinations of the multiple readings of the characters
>>> p.get_pinyins(u'好吗?', splitter=u'', tone_marks='marks')
['hǎoma?', 'hǎomá?', 'hǎomǎ?', 'hàoma?', 'hàomá?', 'hàomǎ?']


如果方法中传入变量,那么直接加前缀是不可以了。而是要将变量转为utf-8编码:
>>> wordvalue = '中国'
>>> wordvalue= unicode(wordvalue,'utf-8')
>>> s = p.get_initials(wordvalue, u'').lower()
'zg'


请输入utf8编码汉字

Expand Down
64 changes: 41 additions & 23 deletions src/xpinyin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

import os.path
import re
from typing import List, Optional

from xpinyin.combs import get_combs

PinyinToneMark = {
0: u"aoeiuv\u00fc",
Expand All @@ -15,9 +18,8 @@


class Pinyin(object):

"""translate chinese hanzi to pinyin by python, inspired by flyerhzm’s
`chinese\_pinyin`_ gem
`chinese_pinyin`_ gem

usage
-----
Expand Down Expand Up @@ -49,7 +51,7 @@ class Pinyin(object):
'S H'

请输入utf8编码汉字
.. _chinese\_pinyin: https://github.com/flyerhzm/chinese_pinyin
.. _chinese_pinyin: https://github.com/flyerhzm/chinese_pinyin
"""

data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
Expand All @@ -60,7 +62,7 @@ def __init__(self, data_path=data_path):
with open(data_path) as f:
for line in f:
k, v = line.split('\t')
self.dict[k] = v
self.dict[k] = v.rstrip()

@staticmethod
def decode_pinyin(s):
Expand Down Expand Up @@ -107,29 +109,45 @@ def convert_pinyin(word, convert):
if convert == 'upper':
return word.upper()

def get_pinyin(self, chars=u'你好', splitter=u'-',
tone_marks=None, convert='lower'):
result = []
flag = 1
def get_pinyins(self, chars: str, splitter: str = u'-',
tone_marks: Optional[str] = None, convert: str = 'lower', n: int = 10) -> List[str]:
"""
Get All pinyin combinations given all possible readings of each character.
The number of combinations is limited by default to 10 to avoid exponential explosion on long texts.
"""
all_pinyin_options = [] # a list of lists that we'll fill with all pinyin options for each character
flag = 1 # in the list (otherwise, probably not a Chinese character)
for char in chars:
key = "%X" % ord(char)
try:
if tone_marks == 'marks':
word = self.decode_pinyin(self.dict[key].split()[0].strip())
elif tone_marks == 'numbers':
word = self.dict[key].split()[0].strip()
if key not in self.dict:
if flag == 1:
all_pinyin_options.append([char]) # add as is
else:
word = self.dict[key].split()[0].strip()[:-1]
word = self.convert_pinyin(word, convert)
result.append(word)
flag = 1
except KeyError:
if flag:
result.append(char)
all_pinyin_options[-1][-1] += char # add to previous sequence of non Chinese chars
flag = 0 # within a sequence of non Chinese characters
else:
if tone_marks is None: # in this case we may have duplicates if the variations differ just by the tones
char_py_options = []
for v in self.dict[key].split():
if v[0:-1] not in char_py_options: # we remove the tone mark while we're at it
char_py_options.append(v[0:-1])
else:
result[-1] += char
flag = 0
return splitter.join(result)
char_py_options = self.dict[key].split()
last = 1 if n == 1 else len(char_py_options)
if tone_marks == 'marks':
char_options = [Pinyin.decode_pinyin(o) for o in char_py_options[0:last]]
else: # 'numbers' or None
char_options = [o for o in char_py_options[0:last]]

all_pinyin_options.append([Pinyin.convert_pinyin(c, convert) for c in char_options])
flag = 1

return get_combs(options=all_pinyin_options, splitter=splitter, n=n)

def get_pinyin(self, chars: str, splitter: str = u'-',
               tone_marks=None, convert: str = 'lower') -> str:
    """
    Return a single pinyin rendering of ``chars``.

    Delegates to ``get_pinyins`` asking for exactly one combination (``n=1``)
    and returns that one string.

    :param chars: the text to convert
    :param splitter: the string placed between the converted groups
    :param tone_marks: forwarded unchanged to ``get_pinyins``
    :param convert: forwarded unchanged to ``get_pinyins``
    :return: the first (and only requested) combination
    """
    candidates = self.get_pinyins(
        chars,
        splitter=splitter,
        tone_marks=tone_marks,
        convert=convert,
        n=1,
    )
    return candidates[0]

def get_initial(self, char=u'你'):
try:
Expand Down
61 changes: 61 additions & 0 deletions src/xpinyin/combs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import List


def _get_comb_indexes(num_options_list: List[int], n=None) -> List[List[int]]:
    """
    Given a list with the number of possible options per place, returns a list of index combinations.

    Each combination holds one zero-based option index per place. The combinations are produced in
    multi-radix counting order: the right-most place varies fastest, so the results go from the
    smallest multi-radix number to the largest.

    e.g. [2, 2, 1] -> [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]]
    i.e. we have 2 options (0, 1) for the first and second places and one option (0) for the third.

    An empty ``num_options_list`` yields exactly one (empty) combination.

    :param num_options_list: a list with the number of options per place
    :param n: The maximal number of requested combinations. If None, all possible combinations will be returned
    :return: at most ``n`` combinations, each one a fresh list of indexes
    :raises ValueError: if a place has no options, or if ``n`` is given and is smaller than 1
    """
    # Local import: the surrounding module only imports ``typing``.
    from itertools import islice, product

    # Validate the places independently of ``n`` so that a bad ``n`` can neither
    # hide an empty place nor be reported as one.
    if any(num_options < 1 for num_options in num_options_list):
        raise ValueError("Can't create combinations with 0-length lists")
    if n is not None and n < 1:
        raise ValueError("The number of requested combinations must be at least 1, got %r" % (n,))

    # product() already enumerates with the right-most place varying fastest and is lazy,
    # so islice() caps the work at ``n`` combinations (``None`` means no cap).
    all_indexes = product(*(range(num_options) for num_options in num_options_list))
    return [list(indexes) for indexes in islice(all_indexes, n)]


def get_combs(options: List[List[str]], splitter: str = '', n: int = None) -> List[str]:
    """
    Build up to n strings, each made by picking one option from every group and joining the picks.

    e.g.: [['a'], ['1' ,'2'], ['@']] -> [a1@, a2@]
    Here ['1' ,'2'] is the group holding the candidates for the second place.

    :param options: one list of candidate strings per place
    :param splitter: the string inserted between the picks of consecutive places
    :param n: The maximal number of requested combinations. If None, all possible combinations will be returned
    :return: the joined combinations, in the order supplied by ``_get_comb_indexes``
    :raises ValueError: propagated from ``_get_comb_indexes`` when it cannot build any combination
        (e.g. a group without options)
    """
    group_sizes = list(map(len, options))
    return [
        # index_row holds, for every place, the position of the chosen option, e.g. [0, 2, 1]
        splitter.join(group[pick] for group, pick in zip(options, index_row))
        for index_row in _get_comb_indexes(group_sizes, n)
    ]
68 changes: 63 additions & 5 deletions src/xpinyin/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
# -*- coding: utf-8 -*-
import unittest

from xpinyin.combs import _get_comb_indexes, get_combs


class PinyinTests(unittest.TestCase):
def Pinyin(self, *a, **kw):
@staticmethod
def Pinyin(*a, **kw):
from xpinyin import Pinyin

return Pinyin(*a, **kw)
Expand All @@ -23,8 +27,6 @@ def test_get_pinyin_mixed_words(self):

def test_get_pinyin_with_tone_marks(self):
self.assertEqual(self.p.get_pinyin(u'上海', tone_marks='marks'), u'sh\xe0ng-h\u01cei')

def test_get_pinyin_with_tone_marks(self):
self.assertEqual(self.p.get_pinyin(u'秋', tone_marks='marks'), u'qiū')

def test_get_initial(self):
Expand All @@ -37,5 +39,61 @@ def test_get_initials_with_splitter(self):
self.assertEqual(self.p.get_initials(u'你好', u' '), u'N H')
self.assertEqual(self.p.get_initials(u'你好', u''), u'NH')

if __name__ == '__main__':
unittest.main()
# --- testing combinations auxiliary functions ---

def test_get_comb_indexes(self):
    """Without a limit, every index combination of [2, 2, 1] is returned."""
    expected = [[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 1, 0]]
    actual = _get_comb_indexes([2, 2, 1])
    self.assertEqual(expected, actual)

def test_get_comb_indexes_max_num(self):
    """Requesting 3 combinations returns only the first three of the four possible ones."""
    expected = [[0, 0, 0], [0, 1, 0], [1, 0, 0]]
    actual = _get_comb_indexes([2, 2, 1], 3)
    self.assertEqual(expected, actual)

def test_get_combs(self):
    """With default splitter and no limit, all 2*2*2 joined combinations are returned."""
    expected = ['a1@', 'a1#', 'a2@', 'a2#', 'b1@', 'b1#', 'b2@', 'b2#']
    groups = [['a', 'b'], ['1', '2'], ['@', '#']]
    self.assertEqual(expected, get_combs(groups))

def test_get_combs_splitter_max_num(self):
    """A custom splitter is inserted between picks and n=5 truncates the result."""
    expected = ['a 1 @', 'a 1 #', 'a 2 @', 'a 2 #', 'b 1 @']
    groups = [['a', 'b'], ['1', '2'], ['@', '#']]
    self.assertEqual(expected, get_combs(groups, splitter=' ', n=5))

def test_get_combs_max_num_too_big(self):
    """Asking for more combinations than exist simply returns all of them."""
    expected = ['a||1||@', 'a||1||#', 'a||2||@', 'a||2||#', 'b||1||@', 'b||1||#', 'b||2||@', 'b||2||#']
    groups = [['a', 'b'], ['1', '2'], ['@', '#']]
    self.assertEqual(expected, get_combs(groups, splitter='||', n=100))

# --- testing pinyin combinations ---

def test_get_pinyins_with_default_splitter(self):
    """With default arguments the expected single result is joined with '-'."""
    actual = self.p.get_pinyins(u'上海')
    self.assertEqual(actual, [u'shang-hai'])

def test_get_pinyins_single_char(self):
    """A single character with several readings yields one tone-marked result per reading."""
    # dictionary entry: 4E50 LE4 YUE4 YAO4 LUO4 LIAO2
    expected = ['lè', 'yuè', 'yào', 'luò', 'liáo']
    actual = self.p.get_pinyins(u'乐', splitter='', tone_marks='marks')
    self.assertEqual(expected, actual)

def test_get_pinyins_two_chars(self):
    """The result count for two characters is the product of their individual result counts."""
    first_only = self.p.get_pinyins(u'音', splitter='', tone_marks='marks')
    second_only = self.p.get_pinyins(u'乐', splitter='', tone_marks='marks')
    both = self.p.get_pinyins(u'音乐', splitter='', tone_marks='marks')
    self.assertEqual(len(both), len(first_only) * len(second_only))
    self.assertIn('yīnyuè', both)

def test_get_pinyins_no_tones_uniq(self):
    """Without tones, readings that differ only by tone collapse into a single entry."""
    actual = self.p.get_pinyins(u'吗', splitter='', tone_marks=None)
    self.assertEqual(['ma'], actual)

def test_get_pinyins_max_num(self):
    """n=5 caps the number of returned combinations at five."""
    limited = self.p.get_pinyins(u'音乐', splitter='', n=5)
    self.assertEqual(5, len(limited))

def test_get_pinyins_mixed_words(self):
    """Runs of non-Chinese characters stay intact as one group around the converted character."""
    actual = self.p.get_pinyins(u'ABC串123', splitter=u' ', tone_marks='marks')
    self.assertEqual(actual, ['ABC chuàn 123', 'ABC guàn 123'])

def test_get_pinyins_long_seq(self):
    """On a long passage the result count equals the requested n, or 10 when n is omitted."""
    passage = u"""汉语拼音(Hànyǔ Pīnyīn),
簡稱拼音,是一種以拉丁字母作普通话(現代標準漢語)標音的方案,為中文羅馬拼音的國際標準規範。
汉语拼音在中国大陆作为基础教育内容全面使用,是义务教育的重要内容。在海外,特别是常用現代標準漢語的地区如新加坡、
马来西亚、菲律宾和美国唐人街等,目前也在汉语教育中进行汉语拼音教学。臺灣自2008年開始,
中文譯音使用原則也採用漢語拼音[1],但舊護照姓名和部分地名、道路名稱仍採用威妥瑪拼音、
郵政式拼音、國語羅馬字、國音二式抑或通用拼音[2]。"""
    explicit_limit = self.p.get_pinyins(passage, n=20)
    self.assertEqual(20, len(explicit_limit))
    default_limit = self.p.get_pinyins(passage)  # limited to 10 by default
    self.assertEqual(10, len(default_limit))

if __name__ == '__main__':
unittest.main()