-
Notifications
You must be signed in to change notification settings - Fork 0
/
CleanText.py
133 lines (104 loc) · 4.59 KB
/
CleanText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
__version__ = "0.1"
__author__ = "Kalle Westerling"

# ---------------------------------------------------------------------------
# Default configuration flags. Each CleanText instance copies these at
# construction time, so they act as module-wide defaults.
# ---------------------------------------------------------------------------

# Transformations applied to the text itself
LOWER = True                 # lowercase everything
EXPAND_CONTRACTIONS = True   # expand contractions via the bundled mapping
REMOVE_STOPWORDS = True      # drop stopwords

# Elements stripped out of the text entirely
LINKS = True        # URLs
DIGITS = True       # digits, ordinals, decade forms
EMOJI = True        # emoji code points
HASH = True         # #hashtags
AT = True           # @mentions
PUNCTUATION = True  # punctuation characters
#########################################
# Imports — one per line (PEP 8 E401), grouped stdlib / third-party.

# Standard library
import re
import string

# Third-party
import html2text
import unidecode  # new dependency: unidecode
import yaml
# Constants — patterns pre-compiled once at module level.
# Emoji ranges: misc symbols (U+2600–U+27BF), emoticons (U+1F300–U+1F64F),
# transport/map symbols (U+1F680–U+1F6FF).
RE_EMOJI = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
# Matches ordinals ("4th", "21st", "2nd", "3rd"), decade forms ("'90s"),
# and bare digits. Fix: added the missing "rd" suffix — previously "3rd"
# left a dangling "rd" behind because only the digits matched.
RE_DIGITS = re.compile(r'\d+(th|st|nd|rd|nth)|[\'`´‘]?\d+[\'`´‘]?s?\b|\d')
class CleanText():
    '''
    Pipeline for cleaning a piece of text.

    The module-level flags (LOWER, LINKS, ...) provide the defaults; each
    instance copies them into attributes at construction time and then
    immediately runs ``clean()``, leaving the result in ``self.text``.
    The untouched input is kept in ``self.original_text``.

    Parameters:
        text (str): text to clean; any non-string input is replaced by "".
        special_replacements (dict): optional substring -> replacement
            mapping applied early in the pipeline.
        stopwords (list): extra stopwords removed in addition to the
            bundled YoastSEO list.

    Raises:
        RuntimeError: if a truthy ``special_replacements`` is not a dict,
            or a truthy ``stopwords`` is not a list.
    '''

    # Configuration files bundled with the package.
    # NOTE(review): absolute paths tied to a system CPython 3.7 install —
    # consider resolving them relative to the installed package instead.
    STOPWORDS_PATH = '/usr/local/lib/python3.7/site-packages/CleanTextConfiguration/YoastSEO-stopwords.txt'
    CONTRACTIONS_PATH = "/usr/local/lib/python3.7/site-packages/CleanTextConfiguration/contractions.yml"

    def __init__(self, text, special_replacements=None, stopwords=None):
        # Fix: None sentinels instead of mutable default arguments ({} / []),
        # which are shared across every call of the function.
        if special_replacements is None: special_replacements = {}
        if stopwords is None: stopwords = []
        # Only strings are processed; anything else is neutralized to "".
        if not isinstance(text, str): text = ""
        if stopwords and not isinstance(stopwords, list): raise RuntimeError("stopwords provided needs to be a list of strings.")
        if special_replacements and not isinstance(special_replacements, dict): raise RuntimeError("special_replacements provided needs to be a dictionary.")
        self.original_text = text
        # Per-instance copies of the module defaults — transformations.
        self.lower = LOWER
        self.expand_contractions = EXPAND_CONTRACTIONS
        self.remove_stopwords = REMOVE_STOPWORDS
        self.special_replacements = special_replacements
        self.stopwords = stopwords
        # Per-instance copies of the module defaults — removals.
        self.links = LINKS
        self.digits = DIGITS
        self.emoji = EMOJI
        self.hash = HASH
        self.at = AT
        self.punctuation = PUNCTUATION
        # Run the pipeline immediately; the result lands in self.text.
        self.text = self.original_text
        self.clean()

    def clean(self):
        '''Run the cleaning pipeline over original_text.

        Stores the result in ``self.text`` and also returns it.
        '''
        cleaned = self.original_text
        if self.lower: cleaned = cleaned.lower()
        if self.expand_contractions: cleaned = self._expand_contractions(cleaned)
        if self.special_replacements: cleaned = self._special_replacement(cleaned)
        cleaned = self._clean_html(cleaned)
        if self.links: cleaned = self._clean_links(cleaned)
        if self.hash: cleaned = self._clean_hashtags(cleaned)
        if self.at: cleaned = self._clean_ats(cleaned)
        if self.digits: cleaned = self._clean_digits(cleaned)
        cleaned = unidecode.unidecode(cleaned)  # transliterate non-ASCII to ASCII
        if self.emoji: cleaned = self._clean_emojis(cleaned)
        if self.punctuation: cleaned = self._clean_punctuation(cleaned)
        # Fix: honour the remove_stopwords flag — previously stopwords were
        # removed unconditionally and the flag was dead. Default behavior
        # is unchanged because REMOVE_STOPWORDS defaults to True.
        if self.remove_stopwords: cleaned = self._clean_stopwords(cleaned)
        cleaned = cleaned.strip()
        self.text = cleaned
        return cleaned

    def _special_replacement(self, text):
        '''Apply the user-supplied replacement mapping in a single regex pass.'''
        # NOTE(review): keys are interpolated into the pattern unescaped, so
        # regex metacharacters in a key act as regex syntax — confirm intended.
        pattern = re.compile('(%s)' % '|'.join(self.special_replacements.keys()))
        return pattern.sub(lambda m: self.special_replacements[m.group(0)], text)

    def _clean_html(self, text):
        '''Strip HTML markup, ignoring links and bypassing tables.'''
        handler = html2text.HTML2Text()
        handler.ignore_links = True
        handler.bypass_tables = True
        return handler.handle(text).strip()

    def _clean_links(self, text):
        '''Remove URLs.  # todo: also replace www.????.co/m?'''
        stripped = re.sub(r"(?i)\b((?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))", "", text)
        return stripped.strip()

    def _clean_hashtags(self, text):
        '''Remove #hashtags (word characters and hyphens).'''
        return re.sub(r"#[\w-]+", "", text)

    def _clean_ats(self, text):
        '''Remove @mentions (word characters and hyphens).'''
        return re.sub(r"@[\w-]+", "", text)

    def _clean_digits(self, text):
        '''Remove digits, ordinals, and decade forms via RE_DIGITS.'''
        return RE_DIGITS.sub(r'', text)

    def _clean_emojis(self, text):
        '''Remove the emoji code points covered by RE_EMOJI.'''
        # thanks https://gist.github.com/Alex-Just/e86110836f3f93fe7932290526529cd1#gistcomment-3059482
        return RE_EMOJI.sub(r'', text)

    def _clean_punctuation(self, text):
        '''Replace ASCII punctuation and a few typographic marks with spaces.'''
        without_ascii = re.sub("[{}]".format(string.punctuation), " ", text)
        return re.sub("[¡“”’]", " ", without_ascii)

    def _clean_stopwords(self, text):
        '''Drop words found in the bundled stopword list plus self.stopwords.'''
        with open(self.STOPWORDS_PATH, 'r') as f:
            stops = f.read().splitlines()
        stops.extend(self.stopwords)
        stops = set(stops)  # set for O(1) membership tests
        return " ".join(word for word in text.split() if word not in stops)

    def _expand_contractions(self, text):
        '''Expand contractions using the bundled contractions.yml mapping.'''
        with open(self.CONTRACTIONS_PATH) as f:
            contractions = yaml.safe_load(stream=f)
        pattern = re.compile('(%s)' % '|'.join(contractions.keys()))
        return pattern.sub(lambda m: contractions[m.group(0)], text)

    def __repr__(self):
        # Unconventional: repr is the cleaned text itself; kept for
        # backward compatibility with existing callers.
        return self.text