-
Notifications
You must be signed in to change notification settings - Fork 115
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b5d0fb2
commit 46bbbd0
Showing
10 changed files
with
2,605 additions
and
6 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
import jieba | ||
import synonyms | ||
import random | ||
from random import shuffle | ||
|
||
random.seed(2019) | ||
|
||
# Stop-word list; by default the HIT (Harbin Institute of Technology) table.
# The file is read inside a context manager so the handle is always closed,
# and only the line terminator is stripped: slicing off the last character
# would truncate a final line that has no trailing newline.
with open('stopwords/stopwords.txt', encoding='utf-8') as f:
    stop_words = [line.rstrip('\n') for line in f]
|
||
# 考虑到与英文的不同,暂时搁置 | ||
# 文本清理 | ||
''' | ||
import re | ||
def get_only_chars(line): | ||
#1.清除所有的数字 | ||
''' | ||
|
||
|
||
######################################################################## | ||
# 同义词替换 | ||
# 替换一个语句中的n个单词为其同义词 | ||
######################################################################## | ||
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words with a random synonym.

    Args:
        words: Tokenised sentence (list of str). It is not modified.
        n: Maximum number of distinct words to replace. Every occurrence of
            a chosen word is replaced by the same synonym.

    Returns:
        A new token list. The tokens are joined with a single space and split
        again, so a synonym that itself contains spaces becomes several
        tokens. An empty input yields an empty list.
    """
    if not words:
        # ' '.join([]).split(' ') would produce [''] rather than [].
        return []

    new_words = words.copy()
    # dict.fromkeys de-duplicates while keeping first-occurrence order. A set
    # of str iterates in a hash-seed-dependent order, which would make the
    # result differ between interpreter runs even with a fixed random seed.
    candidates = list(dict.fromkeys(
        word for word in words if word not in stop_words
    ))
    random.shuffle(candidates)

    num_replaced = 0
    for target in candidates:
        # Checked before replacing so that n == 0 really replaces nothing.
        if num_replaced >= n:
            break
        alternatives = get_synonyms(target)
        if alternatives:
            replacement = random.choice(alternatives)
            new_words = [
                replacement if word == target else word for word in new_words
            ]
            num_replaced += 1

    return ' '.join(new_words).split(' ')
|
||
|
||
def get_synonyms(word):
    """Return candidate synonyms for ``word`` from the ``synonyms`` package.

    Only the first element of the ``synonyms.nearby`` result (the word list)
    is used. The query word is filtered out: the nearby list may contain the
    word itself, and picking it would make a replacement or insertion a
    no-op that still counts as an augmentation.

    Args:
        word: The token to look up.

    Returns:
        A list of candidate words; empty when nothing other than ``word``
        itself is found.
    """
    return [
        candidate
        for candidate in synonyms.nearby(word)[0]
        if candidate != word
    ]
|
||
|
||
######################################################################## | ||
# 随机插入 | ||
# 随机在语句中插入n个词 | ||
######################################################################## | ||
def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` insertion attempts.

    Each attempt is delegated to ``add_word``, which works in place on the
    copy. The caller's list is never modified.

    Args:
        words: Tokenised sentence (list of str).
        n: Number of times ``add_word`` is invoked.

    Returns:
        The augmented copy of ``words``.
    """
    augmented = words.copy()
    for _attempt in range(n):
        add_word(augmented)
    return augmented
|
||
|
||
def add_word(new_words):
    """Insert a synonym of one randomly chosen word at a random position.

    The list is modified in place. Up to 10 randomly chosen words are tried;
    if none of them has a synonym the list is left unchanged.

    Args:
        new_words: Token list to augment in place.

    Returns:
        None.
    """
    if not new_words:
        # random.randint(0, -1) would raise ValueError on an empty list.
        return

    for _attempt in range(10):
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        candidates = get_synonyms(random_word)
        if candidates:
            # A hit on any attempt, including the last one, is used.
            break
    else:
        return

    random_synonym = random.choice(candidates)
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)
|
||
|
||
######################################################################## | ||
# Random swap 随机交换 | ||
# Randomly swap two words in the sentence n times | ||
######################################################################## | ||
|
||
def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    Args:
        words: Tokenised sentence (list of str). It is not modified.
        n: Number of swap attempts to perform.

    Returns:
        The copy with the swaps applied.
    """
    swapped = words.copy()
    for _round in range(n):
        swapped = swap_word(swapped)
    return swapped
|
||
|
||
def swap_word(new_words):
    """Swap two distinct, randomly chosen positions of ``new_words`` in place.

    Args:
        new_words: Token list to modify.

    Returns:
        The same list object. Lists with fewer than two elements are returned
        unchanged, because there is nothing to swap.
    """
    if len(new_words) < 2:
        # Also covers the empty list, where random.randint(0, -1) would raise.
        return new_words

    # random.sample draws two different indices in one step, so a swap always
    # happens; a bounded retry loop can give up while a valid pair exists.
    idx_1, idx_2 = random.sample(range(len(new_words)), 2)
    new_words[idx_1], new_words[idx_2] = new_words[idx_2], new_words[idx_1]
    return new_words
|
||
|
||
######################################################################## | ||
# 随机删除 | ||
# 以概率p删除语句中的词 | ||
######################################################################## | ||
def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    Args:
        words: Tokenised sentence (list of str).
        p: Probability of deleting any single word.

    Returns:
        ``words`` itself when it has at most one element; otherwise a new
        list of the surviving words. If every word was deleted, a list
        holding one randomly chosen original word is returned instead, so
        the result is never empty for non-empty input.
    """
    # "<= 1" rather than "== 1": an empty list has nothing to delete, and
    # would otherwise reach random.randint(0, -1) below and raise ValueError.
    if len(words) <= 1:
        return words

    survivors = [word for word in words if random.uniform(0, 1) > p]
    if not survivors:
        return [words[random.randint(0, len(words) - 1)]]
    return survivors
|
||
|
||
######################################################################## | ||
# EDA函数 | ||
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of ``sentence`` with the four EDA operations.

    The sentence is segmented with jieba, then synonym replacement, random
    insertion, random swap and random deletion are each applied
    ``int(num_aug / 4) + 1`` times.

    Args:
        sentence: Raw input text.
        alpha_sr: Fraction of the tokens to replace with synonyms (at least 1).
        alpha_ri: Fraction of the tokens used as insertion count (at least 1).
        alpha_rs: Fraction of the tokens used as swap count (at least 1).
        p_rd: Per-token deletion probability for random deletion.
        num_aug: Number of augmented sentences to keep. When it is below 1,
            each generated sentence is instead kept with probability
            ``num_aug / number_generated``.

    Returns:
        A list of space-joined token strings: the kept augmented sentences in
        shuffled order, followed by the segmented original as last element.
        If segmentation yields no tokens, only the segmented original is
        returned.
    """
    segmented = " ".join(jieba.cut(sentence))
    words = segmented.split()
    if not words:
        # Nothing to augment; the random operations cannot index an empty list.
        return [segmented]

    num_words = len(words)
    num_new_per_technique = int(num_aug / 4) + 1

    # (operation, second argument) in the order the operations are applied.
    techniques = (
        (synonym_replacement, max(1, int(alpha_sr * num_words))),
        (random_insertion, max(1, int(alpha_ri * num_words))),
        (random_swap, max(1, int(alpha_rs * num_words))),
        (random_deletion, p_rd),
    )

    augmented_sentences = []
    for technique, amount in techniques:
        for _ in range(num_new_per_technique):
            augmented_sentences.append(' '.join(technique(words, amount)))

    shuffle(augmented_sentences)

    if num_aug >= 1:
        # int() keeps the slice valid for fractional values such as 2.5.
        augmented_sentences = augmented_sentences[:int(num_aug)]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [
            s for s in augmented_sentences if random.uniform(0, 1) < keep_prob
        ]

    augmented_sentences.append(segmented)
    return augmented_sentences
|
||
## | ||
# 测试用例 | ||
if __name__ == '__main__':
    # Demo run. The string below records sample output from one run; it is a
    # bare string expression and has no effect at runtime.
    '''
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 礼拜 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 一片 你 飞去 同 和 土地
    我们 就 像 蒲公英 , 向日葵 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 , 我 也 祈祷 着 能 和 你 飞去 同 一片
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 国有土地
    我们 蒲公英 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    '''
    augmented = eda(sentence="我们就像蒲公英,我也祈祷着能和你飞去同一片土地")
    for augmented_sentence in augmented:
        print(augmented_sentence)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Language abbreviations, group 1 (Google).
language_short_zh = ['zh', 'zh_CN', 'zh_HK', 'zh_TW']

language_short_first = [
    'en', 'fr', 'de', 'es', 'be', 'it', 'ja', 'ar', 'nl', 'pt',
    'bg', 'el', 'ca', 'iw', 'is', 'sh', 'ko', 'sv', 'sq', 'ru',
    'no', 'fi', 'hr', 'ro', 'sr', 'pl', 'lt', 'th', 'mk', 'sk',
    'et', 'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs',
]

# Locale-qualified codes; the element order is kept exactly as before.
language_short_other = [
    'en', 'en_US',
    'ar', 'ar_AE', 'ar_BH', 'ar_DZ', 'ar_EG', 'ar_IQ', 'ar_JO', 'ar_KW',
    'ar_LB', 'ar_LY', 'ar_MA', 'ar_OM', 'ar_QA', 'ar_SA', 'ar_SD', 'ar_SY',
    'ar_TN', 'ar_YE',
    'be', 'be_BY',
    'bg', 'bg_BG',
    'bo_CN',
    'ca', 'ca_ES', 'ca_ES_EURO',
    'cs', 'cs_CZ',
    'da', 'da_DK',
    'de', 'de_AT', 'de_AT_EURO', 'de_CH', 'de_DE', 'de_DE_EURO', 'de_LU',
    'de_LU_EURO',
    'el', 'el_GR',
    'en_AU', 'en_CA', 'en_GB', 'en_IE', 'en_IE_EURO', 'en_NZ', 'en_ZA',
    'es', 'es_BO', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC',
    'es_ES', 'es_ES_EURO', 'es_GT', 'es_HN', 'es_MX', 'es_NI',
    'et',
    'es_PA', 'es_PE', 'es_PR', 'es_PY', 'es_SV', 'es_UY', 'es_VE',
    'et_EE',
    'fi', 'fi_FI', 'fi_FI_EURO',
    'fr', 'fr_BE', 'fr_BE_EURO', 'fr_CA', 'fr_CH', 'fr_FR', 'fr_FR_EURO',
    'fr_LU', 'fr_LU_EURO',
    'hr', 'hr_HR',
    'hu', 'hu_HU',
    'is', 'is_IS',
    'it', 'it_CH', 'it_IT', 'it_IT_EURO',
    'iw', 'iw_IL',
    'ja', 'ja_JP',
    'ko', 'ko_KR',
    'lt', 'lt_LT',
    'lv', 'lv_LV',
    'mk', 'mk_MK',
    'nl', 'nl_BE', 'nl_BE_EURO', 'nl_NL', 'nl_NL_EURO',
    'no', 'no_NO', 'no_NO_NY',
    'pl', 'pl_PL',
    'pt', 'pt_BR', 'pt_PT', 'pt_PT_EURO',
    'ro', 'ro_RO',
    'ru', 'ru_RU',
    'sh', 'sh_YU',
    'sk', 'sk_SK',
    'sl', 'sl_SI',
    'sq', 'sq_AL',
    'sr', 'sr_YU',
    'sv', 'sv_SE',
    'th', 'th_TH',
    'tr', 'tr_TR',
    'uk', 'uk_UA',
]
|
||
|
||
# Language abbreviations for the (domestic) online translation services,
# one list per provider.
language_short_google = [
    'en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'it',
    'be', 'nl', 'bg', 'el', 'ca', 'iw', 'is', 'sh', 'sv', 'sq',
    'no', 'fi', 'hr', 'ro', 'pl', 'lt', 'th', 'mk', 'sk', 'et',
    'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs', 'sr',
]
language_short_baidu = [
    'en', 'fra', 'ru', 'de', 'est', 'pt', 'ara', 'jp', 'kor', 'vie',
    'yue', 'wyw', 'spa', 'th', 'it', 'el', 'nl', 'pl', 'bul', 'dan',
    'fin', 'cs', 'rom', 'slo', 'swe', 'hu', 'cht',
]
language_short_youdao = [
    'en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi', 'id',
]
language_short_sougou = [
    'en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi',
    'id', 'it', 'et', 'bg', 'pl', 'bs-Latn', 'fa', 'mww', 'da', 'fi',
    'tlh-Qaak', 'tlh', 'hr', 'otq', 'ca', 'cs', 'ro', 'lv', 'ht', 'lt',
    'nl', 'ms', 'mt', 'sl', 'th', 'tr', 'sk', 'sw', 'af', 'no',
    'uk', 'ur', 'el', 'hu', 'cy', 'yua', 'he', 'hi', 'sv', 'yue',
    'fj', 'fil', 'sm', 'to', 'ty', 'mg', 'bn', 'sr-Latn', 'sr-Cyrl',
]
language_short_tencent = [
    'en', 'fr', 'ru', 'de', 'es', 'pt', 'jp', 'ko', 'vi', 'id',
    'it', 'kr', 'tr', 'ms', 'th',
]
|
||
|
||
# Account credentials (app key / app secret) for the online translation
# services. They are intentionally empty: register for your own.
app_key_google = ""
app_secret_google = ""
app_key_bing = ""
app_secret_bing = ""
app_key_baidu = ""
app_secret_baidu = ""
app_key_youdao = ""
app_secret_youdao = ""
app_key_sougou = ""
app_secret_sougou = ""
app_key_tencent = ""
app_secret_tencent = ""
# Backward-compatible alias: the name was originally misspelled, and existing
# importers may still reference the old spelling.
app_secret_tentcnet = app_secret_tencent
Oops, something went wrong.