Skip to content

Commit

Permalink
data_augmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
rsanshierli committed May 21, 2020

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent b5d0fb2 commit 46bbbd0
Showing 10 changed files with 2,605 additions and 6 deletions.
5 changes: 2 additions & 3 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

191 changes: 191 additions & 0 deletions Augmentation/EDA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
import jieba
import synonyms
import random
from random import shuffle

# Fixed seed so augmentation runs are reproducible.
random.seed(2019)

# Stop-word list; by default the Harbin Institute of Technology stop-word table.
# Use a context manager so the file handle is closed after loading (the
# original left it open). rstrip('\n') instead of [:-1]: slicing off the last
# character would clip the final line when the file lacks a trailing newline.
with open('stopwords/stopwords.txt', encoding='utf-8') as f:
    stop_words = [line.rstrip('\n') for line in f]

# Text cleaning from the English EDA reference implementation — shelved for
# now because Chinese text differs from English.
'''
import re
def get_only_chars(line):
    #1. strip all digits
'''


########################################################################
# Synonym replacement:
# replace up to n words of the sentence with one of their synonyms.
########################################################################
def synonym_replacement(words, n):
    """Replace up to *n* non-stop-words in *words* with a random synonym.

    Returns a new token list; *words* itself is not modified.
    """
    new_words = words.copy()
    # Candidate targets: unique words that are not stop words, in random order.
    candidate_words = list(set(word for word in words if word not in stop_words))
    random.shuffle(candidate_words)
    num_replaced = 0
    for target in candidate_words:
        # NOTE: the original named this local `synonyms`, shadowing the
        # imported `synonyms` module; renamed to avoid confusion.
        choices = get_synonyms(target)
        if len(choices) >= 1:
            replacement = random.choice(choices)
            new_words = [replacement if word == target else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    # Re-join and re-split so a multi-word synonym is broken back into tokens.
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words


def get_synonyms(word):
    """Look up near-synonym candidates for *word* via the synonyms package."""
    nearby_result = synonyms.nearby(word)
    return nearby_result[0]


########################################################################
# Random insertion:
# insert n random synonyms at random positions in the sentence.
########################################################################
def random_insertion(words, n):
    """Return a copy of *words* with *n* synonyms inserted at random spots."""
    augmented = words.copy()
    for _ in range(n):
        add_word(augmented)
    return augmented


def add_word(new_words):
    """Insert one random synonym of a random word into *new_words* in place.

    Gives up silently after 10 attempts to find a word that has synonyms.
    """
    if not new_words:
        # Guard: random.randint(0, -1) would raise ValueError on an empty list.
        return
    candidates = []
    attempts = 0
    while len(candidates) < 1:
        seed_word = new_words[random.randint(0, len(new_words) - 1)]
        candidates = get_synonyms(seed_word)
        attempts += 1
        # Preserve original behavior: bail after 10 draws even if the
        # 10th draw found candidates.
        if attempts >= 10:
            return
    chosen = random.choice(candidates)
    position = random.randint(0, len(new_words) - 1)
    new_words.insert(position, chosen)


########################################################################
# Random swap:
# randomly swap two words in the sentence, n times.
########################################################################

def random_swap(words, n):
    """Return a copy of *words* after *n* random pairwise swaps."""
    shuffled = words.copy()
    for _ in range(n):
        shuffled = swap_word(shuffled)
    return shuffled


def swap_word(new_words):
    """Swap two distinct random positions of *new_words* (mutates in place).

    If a distinct second index is not drawn within a few tries (e.g. a
    one-element list), the list is returned unchanged.
    """
    idx_a = random.randint(0, len(new_words) - 1)
    idx_b = idx_a
    attempts = 0
    while idx_b == idx_a:
        idx_b = random.randint(0, len(new_words) - 1)
        attempts += 1
        if attempts > 3:
            return new_words
    new_words[idx_a], new_words[idx_b] = new_words[idx_b], new_words[idx_a]
    return new_words


########################################################################
# Random deletion:
# delete each word of the sentence independently with probability p.
########################################################################
def random_deletion(words, p):
    """Drop each word of *words* independently with probability *p*.

    Always returns at least one word for non-empty input. An empty input is
    returned as-is (the original raised ValueError via randint(0, -1)).
    """
    # Nothing can be deleted from an empty or single-word sentence.
    if len(words) <= 1:
        return words

    new_words = [word for word in words if random.uniform(0, 1) > p]

    # If everything was deleted, keep one randomly chosen word so the
    # sentence never vanishes entirely.
    if not new_words:
        keep = random.randint(0, len(words) - 1)
        return [words[keep]]

    return new_words


########################################################################
# EDA driver function
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of *sentence* via the four EDA operations.

    Returns roughly *num_aug* augmented sentences plus the segmented
    original, each as a space-joined string of jieba tokens.
    """
    segmented = " ".join(jieba.cut(sentence))
    words = segmented.split()
    num_words = len(words)

    augmented_sentences = []
    per_technique = int(num_aug / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))

    # 1) synonym replacement (sr)
    for _ in range(per_technique):
        augmented_sentences.append(' '.join(synonym_replacement(words, n_sr)))

    # 2) random insertion (ri)
    for _ in range(per_technique):
        augmented_sentences.append(' '.join(random_insertion(words, n_ri)))

    # 3) random swap (rs)
    for _ in range(per_technique):
        augmented_sentences.append(' '.join(random_swap(words, n_rs)))

    # 4) random deletion (rd)
    for _ in range(per_technique):
        augmented_sentences.append(' '.join(random_deletion(words, p_rd)))

    shuffle(augmented_sentences)

    # Trim to the requested count; for fractional num_aug < 1, thin the
    # pool probabilistically instead.
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # Always include the segmented original sentence.
    augmented_sentences.append(segmented)

    return augmented_sentences

##
# Usage example / smoke test.
if __name__ == '__main__':
    # Sample output from one run:
    '''
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 礼拜 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 一片 你 飞去 同 和 土地
    我们 就 像 蒲公英 , 向日葵 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 , 我 也 祈祷 着 能 和 你 飞去 同 一片
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 国有土地
    我们 蒲公英 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    我们 就 像 蒲公英 , 我 也 祈祷 着 能 和 你 飞去 同 一片 土地
    '''
    for augmented in eda(sentence="我们就像蒲公英,我也祈祷着能和你飞去同一片土地"):
        print(augmented)
Empty file added Augmentation/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions Augmentation/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Language short-code lists, set 1 (Google-style codes and locale variants).
language_short_zh = ['zh', 'zh_CN', 'zh_HK', 'zh_TW']
language_short_first = ['en', 'fr', 'de', 'es', 'be', 'it', 'ja', 'ar', 'nl', 'pt', 'bg', 'el', 'ca', 'iw', 'is', 'sh', 'ko', 'sv', 'sq', 'ru', 'no', 'fi', 'hr', 'ro', 'sr', 'pl', 'lt', 'th', 'mk', 'sk', 'et', 'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs']
language_short_other = ['en', 'en_US', 'ar', 'ar_AE', 'ar_BH', 'ar_DZ', 'ar_EG', 'ar_IQ', 'ar_JO', 'ar_KW', 'ar_LB', 'ar_LY', 'ar_MA', 'ar_OM', 'ar_QA', 'ar_SA', 'ar_SD', 'ar_SY', 'ar_TN', 'ar_YE', 'be', 'be_BY', 'bg', 'bg_BG', 'bo_CN', 'ca', 'ca_ES', 'ca_ES_EURO', 'cs', 'cs_CZ', 'da', 'da_DK', 'de', 'de_AT', 'de_AT_EURO', 'de_CH', 'de_DE', 'de_DE_EURO', 'de_LU', 'de_LU_EURO', 'el', 'el_GR', 'en_AU', 'en_CA', 'en_GB', 'en_IE', 'en_IE_EURO', 'en_NZ', 'en_ZA', 'es', 'es_BO', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_ES_EURO', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'et', 'es_PA', 'es_PE', 'es_PR', 'es_PY', 'es_SV', 'es_UY', 'es_VE', 'et_EE', 'fi', 'fi_FI', 'fi_FI_EURO', 'fr', 'fr_BE', 'fr_BE_EURO', 'fr_CA', 'fr_CH', 'fr_FR', 'fr_FR_EURO', 'fr_LU', 'fr_LU_EURO', 'hr', 'hr_HR', 'hu', 'hu_HU', 'is', 'is_IS', 'it', 'it_CH', 'it_IT', 'it_IT_EURO', 'iw', 'iw_IL', 'ja', 'ja_JP', 'ko', 'ko_KR', 'lt', 'lt_LT', 'lv', 'lv_LV', 'mk', 'mk_MK', 'nl', 'nl_BE', 'nl_BE_EURO', 'nl_NL', 'nl_NL_EURO', 'no', 'no_NO', 'no_NO_NY', 'pl', 'pl_PL', 'pt', 'pt_BR', 'pt_PT', 'pt_PT_EURO', 'ro', 'ro_RO', 'ru', 'ru_RU', 'sh', 'sh_YU', 'sk', 'sk_SK', 'sl', 'sl_SI', 'sq', 'sq_AL', 'sr', 'sr_YU', 'sv', 'sv_SE', 'th', 'th_TH', 'tr', 'tr_TR', 'uk', 'uk_UA']


# Language short codes accepted by domestic (Chinese) online translation services.
language_short_google = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'it', 'be', 'nl', 'bg', 'el', 'ca', 'iw', 'is', 'sh', 'sv', 'sq', 'no', 'fi', 'hr', 'ro', 'pl', 'lt', 'th', 'mk', 'sk', 'et', 'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs', 'sr']
language_short_baidu = ['en', 'fra', 'ru', 'de', 'est', 'pt', 'ara', 'jp', 'kor', 'vie', 'yue', 'wyw', 'spa', 'th', 'it', 'el', 'nl', 'pl', 'bul', 'dan', 'fin', 'cs', 'rom', 'slo', 'swe', 'hu', 'cht']
language_short_youdao = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi', 'id']
language_short_sougou = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi', 'id', 'it', 'et', 'bg', 'pl', 'bs-Latn', 'fa', 'mww', 'da', 'fi', 'tlh-Qaak', 'tlh', 'hr', 'otq', 'ca', 'cs', 'ro', 'lv', 'ht', 'lt', 'nl', 'ms', 'mt', 'sl', 'th', 'tr', 'sk', 'sw', 'af', 'no', 'uk', 'ur', 'el', 'hu', 'cy', 'yua', 'he', 'hi', 'sv', 'yue', 'fj', 'fil', 'sm', 'to', 'ty', 'mg', 'bn', 'sr-Latn', 'sr-Cyrl']
language_short_tencent = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'jp', 'ko', 'vi', 'id', 'it', 'kr', 'tr', 'ms', 'th']


# Online-translation API credentials -- register with each service yourself
# and fill these in.
app_key_google = ""
app_secret_google = ""
app_key_bing = ""
app_secret_bing = ""
app_key_baidu = ""
app_secret_baidu = ""
app_key_youdao = ""
app_secret_youdao = ""
app_key_sougou = ""
app_secret_sougou = ""
app_key_tencent = ""
# Misspelled name kept for backward compatibility with existing callers;
# prefer the correctly spelled alias below.
app_secret_tentcnet = ""
app_secret_tencent = app_secret_tentcnet
Loading

0 comments on commit 46bbbd0

Please sign in to comment.