From ef3be7eb553e5812a15b0df5e3a3f0e69708b19a Mon Sep 17 00:00:00 2001 From: Ayan Date: Tue, 22 Jun 2021 11:40:23 +0530 Subject: [PATCH 1/7] Move preprocessing functions from textcourpus module --- gensim/corpora/textcorpus.py | 102 ++++---------------------------- gensim/parsing/preprocessing.py | 78 +++++++++++++++++++++++- 2 files changed, 86 insertions(+), 94 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index e5616fe9d7..63acda6620 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -44,93 +44,13 @@ from gensim import interfaces, utils from gensim.corpora.dictionary import Dictionary -from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE +from gensim.parsing.preprocessing import remove_stopword_tokens, remove_short_tokens, \ + lower_to_unicode, strip_multiple_whitespaces from gensim.utils import deaccent, simple_tokenize logger = logging.getLogger(__name__) -def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`. - - Parameters - ---------- - tokens : iterable of str - Sequence of tokens. - stopwords : iterable of str, optional - Sequence of stopwords - - Returns - ------- - list of str - List of tokens without `stopwords`. - - """ - return [token for token in tokens if token not in stopwords] - - -def remove_short(tokens, minsize=3): - """Remove tokens shorter than `minsize` chars. - - Parameters - ---------- - tokens : iterable of str - Sequence of tokens. - minsize : int, optimal - Minimal length of token (include). - - Returns - ------- - list of str - List of tokens without short tokens. - - """ - return [token for token in tokens if len(token) >= minsize] - - -def lower_to_unicode(text, encoding='utf8', errors='strict'): - """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`. - - Parameters - ---------- - text : str - Input text. 
- encoding : str, optional - Encoding that will be used for conversion. - errors : str, optional - Error handling behaviour, used as parameter for `unicode` function (python2 only). - - Returns - ------- - str - Unicode version of `text`. - - See Also - -------- - :func:`gensim.utils.any2unicode` - Convert any string to unicode-string. - - """ - return utils.to_unicode(text.lower(), encoding, errors) - - -def strip_multiple_whitespaces(s): - """Collapse multiple whitespace characters into a single space. - - Parameters - ---------- - s : str - Input string - - Returns - ------- - str - String with collapsed whitespaces. - - """ - return RE_WHITESPACE.sub(" ", s) - - class TextCorpus(interfaces.CorpusABC): """Helper class to simplify the pipeline of getting BoW vectors from plain text. @@ -177,12 +97,12 @@ class TextCorpus(interfaces.CorpusABC): The default preprocessing consists of: - #. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding) + #. :func:`~gensim.parsing.preprocessing.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding) #. :func:`~gensim.utils.deaccent`- deaccent (asciifolding) - #. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one + #. :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`- collapse multiple whitespaces into a single one #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace - #. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long - #. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords + #. :func:`~gensim.parsing.preprocessing.remove_short_tokens` - remove words less than 3 characters long + #. 
:func:`~gensim.parsing.preprocessing.remove_stopword_tokens` - remove stopwords """ @@ -204,15 +124,15 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter Each will be applied to the text of each document in order, and should return a single string with the modified text. For Python 2, the original text will not be unicode, so it may be useful to convert to unicode as the first character filter. - If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`, - :func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`. + If None - using :func:`~gensim.parsing.preprocessing.lower_to_unicode`, + :func:`~gensim.utils.deaccent` and :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`. tokenizer : callable, optional Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`. token_filters : iterable of callable, optional Each will be applied to the iterable of tokens in order, and should return another iterable of tokens. These filters can add, remove, or replace tokens, or do nothing at all. - If None - using :func:`~gensim.corpora.textcorpus.remove_short` and - :func:`~gensim.corpora.textcorpus.remove_stopwords`. + If None - using :func:`~gensim.parsing.preprocessing.remove_short_tokens` and + :func:`~gensim.parsing.preprocessing.remove_stopword_tokens`. 
Examples -------- @@ -254,7 +174,7 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter self.token_filters = token_filters if self.token_filters is None: - self.token_filters = [remove_short, remove_stopwords] + self.token_filters = [remove_short_tokens, remove_stopword_tokens] self.length = None self.dictionary = None diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 777ca46e8e..81638a2fbe 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -68,17 +68,20 @@ RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE) -def remove_stopwords(s): +def remove_stopwords(s, stopwords=None): """Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`. Parameters ---------- s : str + stopwords : iterable of str, optional + Sequence of stopwords + If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS` Returns ------- str - Unicode string without :const:`~gensim.parsing.preprocessing.STOPWORDS`. + Unicode string without `stopwords`. Examples -------- @@ -89,8 +92,32 @@ def remove_stopwords(s): u'Better late never, better late.' """ + if stopwords is None: + stopwords = STOPWORDS s = utils.to_unicode(s) - return " ".join(w for w in s.split() if w not in STOPWORDS) + return " ".join(w for w in s.split() if w not in stopwords) + + +def remove_stopword_tokens(tokens, stopwords=None): + """Remove stopword tokens using list `stopwords`. + + Parameters + ---------- + tokens : iterable of str + Sequence of tokens. + stopwords : iterable of str, optional + Sequence of stopwords + If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS` + + Returns + ------- + list of str + List of tokens without `stopwords`. 
+ + """ + if stopwords is None: + stopwords = STOPWORDS + return [token for token in tokens if token not in stopwords] def strip_punctuation(s): @@ -175,6 +202,25 @@ def strip_short(s, minsize=3): return " ".join(e for e in s.split() if len(e) >= minsize) +def remove_short_tokens(tokens, minsize=3): + """Remove tokens shorter than `minsize` chars. + + Parameters + ---------- + tokens : iterable of str + Sequence of tokens. + minsize : int, optional + Minimal length of token (inclusive). + + Returns + ------- + list of str + List of tokens without short tokens. + """ + + return [token for token in tokens if len(token) >= minsize] + + def strip_numeric(s): """Remove digits from `s` using :const:`~gensim.parsing.preprocessing.RE_NUMERIC`. @@ -310,6 +356,32 @@ def stem_text(text): stem = stem_text +def lower_to_unicode(text, encoding='utf8', errors='strict'): + """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`. + + Parameters + ---------- + text : str + Input text. + encoding : str, optional + Encoding that will be used for conversion. + errors : str, optional + Error handling behaviour, used as parameter for `unicode` function (python2 only). + + Returns + ------- + str + Unicode version of `text`. + + See Also + -------- + :func:`gensim.utils.any2unicode` + Convert any string to unicode-string. 
+ + """ + return utils.to_unicode(text.lower(), encoding, errors) + + DEFAULT_FILTERS = [ lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, From 38b8616dd1c55b131114eb95af1934c200919d1f Mon Sep 17 00:00:00 2001 From: Ayan Date: Tue, 22 Jun 2021 11:43:27 +0530 Subject: [PATCH 2/7] Move preprocessing functions from lowcorpus module --- gensim/corpora/lowcorpus.py | 21 ++------------------- gensim/parsing/preprocessing.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index 80dacf8ec0..01b1043a9c 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -11,28 +11,11 @@ from gensim import utils from gensim.corpora import IndexedCorpus - +from gensim.parsing.preprocessing import split_on_space logger = logging.getLogger(__name__) -def split_on_space(s): - """Split line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`. - - Parameters - ---------- - s : str - Some line. - - Returns - ------- - list of str - List of tokens from `s`. - - """ - return [word for word in utils.to_unicode(s).strip().split(' ') if word] - - class LowCorpus(IndexedCorpus): """Corpus handles input in `GibbsLda++ format `_. @@ -86,7 +69,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): If not provided, the mapping is constructed directly from `fname`. line2words : callable, optional Function which converts lines(str) into tokens(list of str), - using :func:`~gensim.corpora.lowcorpus.split_on_space` as default. + using :func:`~gensim.parsing.preprocessing.split_on_space` as default. 
""" IndexedCorpus.__init__(self, fname) diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index 81638a2fbe..c4b10d4429 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -382,6 +382,23 @@ def lower_to_unicode(text, encoding='utf8', errors='strict'): return utils.to_unicode(text.lower(), encoding, errors) +def split_on_space(s): + """Split line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`. + + Parameters + ---------- + s : str + Some line. + + Returns + ------- + list of str + List of tokens from `s`. + + """ + return [word for word in utils.to_unicode(s).strip().split(' ') if word] + + DEFAULT_FILTERS = [ lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, From 59fd14de7c010be8495be8e56bd6e4489b2b7fd4 Mon Sep 17 00:00:00 2001 From: Ayan Date: Tue, 22 Jun 2021 12:00:32 +0530 Subject: [PATCH 3/7] Add test cases for preprocessing functions --- gensim/test/test_parsing.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index d61671bd85..73cc81a713 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -8,9 +8,10 @@ import logging import unittest import numpy as np +import gensim.parsing.preprocessing as preprocessing from gensim.parsing.preprocessing import \ remove_stopwords, strip_punctuation2, strip_tags, strip_short, strip_numeric, strip_non_alphanum, \ - strip_multiple_whitespaces, split_alphanum, stem_text + strip_multiple_whitespaces, split_alphanum, stem_text, remove_stopword_tokens, remove_short_tokens, split_on_space # several documents @@ -67,6 +68,27 @@ def test_split_alphanum(self): def test_strip_stopwords(self): self.assertEqual(remove_stopwords("the world is square"), "world square") + # confirm redifining the global `STOPWORDS` working + STOPWORDS = preprocessing.STOPWORDS + preprocessing.STOPWORDS = 
frozenset(["the"]) + self.assertEqual(remove_stopwords("the world is square"), "world is square") + preprocessing.STOPWORDS = STOPWORDS + + def test_strip_stopword_tokens(self): + self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "sphere"]) + + # confirm redifining the global `STOPWORDS` working + STOPWORDS = preprocessing.STOPWORDS + preprocessing.STOPWORDS = frozenset(["the"]) + self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "is", "sphere"]) + preprocessing.STOPWORDS = STOPWORDS + + def test_strip_short_tokens(self): + self.assertEqual(remove_short_tokens(["salut", "les", "amis", "du", "59"], 3), ["salut", "les", "amis"]) + + def test_split_on_space(self): + self.assertEqual(split_on_space(" salut les amis du 59 "), ["salut", "les", "amis", "du", "59"]) + def test_stem_text(self): target = \ "while it is quit us to be abl to search a larg " + \ From a24e397bd28e9d58b812ea0176675c51ca69e121 Mon Sep 17 00:00:00 2001 From: Ayan Date: Tue, 22 Jun 2021 14:13:57 +0530 Subject: [PATCH 4/7] Fix styling issues --- gensim/corpora/textcorpus.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 63acda6620..c2b8b620bf 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -44,8 +44,10 @@ from gensim import interfaces, utils from gensim.corpora.dictionary import Dictionary -from gensim.parsing.preprocessing import remove_stopword_tokens, remove_short_tokens, \ - lower_to_unicode, strip_multiple_whitespaces +from gensim.parsing.preprocessing import ( + remove_stopword_tokens, remove_short_tokens, + lower_to_unicode, strip_multiple_whitespaces, +) from gensim.utils import deaccent, simple_tokenize logger = logging.getLogger(__name__) @@ -99,7 +101,7 @@ class TextCorpus(interfaces.CorpusABC): #. 
:func:`~gensim.parsing.preprocessing.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding) #. :func:`~gensim.utils.deaccent`- deaccent (asciifolding) - #. :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`- collapse multiple whitespaces into a single one + #. :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces` - collapse multiple whitespaces into one #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace #. :func:`~gensim.parsing.preprocessing.remove_short_tokens` - remove words less than 3 characters long #. :func:`~gensim.parsing.preprocessing.remove_stopword_tokens` - remove stopwords From 148531bb730bcfbbff0f130505557fa54cae043b Mon Sep 17 00:00:00 2001 From: Ayan Date: Tue, 22 Jun 2021 14:25:25 +0530 Subject: [PATCH 5/7] Refactor remove_stopwords() and strip_short() --- gensim/parsing/preprocessing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index c4b10d4429..b715748f21 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -92,10 +92,8 @@ def remove_stopwords(s, stopwords=None): u'Better late never, better late.' 
""" - if stopwords is None: - stopwords = STOPWORDS s = utils.to_unicode(s) - return " ".join(w for w in s.split() if w not in stopwords) + return " ".join(remove_stopword_tokens(s.split(), stopwords)) def remove_stopword_tokens(tokens, stopwords=None): @@ -199,7 +197,7 @@ def strip_short(s, minsize=3): """ s = utils.to_unicode(s) - return " ".join(e for e in s.split() if len(e) >= minsize) + return " ".join(remove_short_tokens(s.split(), minsize)) def remove_short_tokens(tokens, minsize=3): From a57ef56d134df24b897e292071d8861900ce3dbe Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 12 Aug 2021 10:41:07 +0900 Subject: [PATCH 6/7] make tests pass --- gensim/test/test_parsing.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index d7486a1497..305b3e501a 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -7,12 +7,19 @@ import logging import unittest + +import mock import numpy as np + +import gensim.parsing.preprocessing from gensim.parsing.preprocessing import ( + remove_short_tokens, + remove_stopword_tokens, remove_stopwords, stem_text, split_alphanum, + split_on_space, strip_multiple_whitespaces, strip_non_alphanum, strip_numeric, @@ -76,19 +83,18 @@ def test_strip_stopwords(self): self.assertEqual(remove_stopwords("the world is square"), "world square") # confirm redifining the global `STOPWORDS` working - STOPWORDS = preprocessing.STOPWORDS - preprocessing.STOPWORDS = frozenset(["the"]) - self.assertEqual(remove_stopwords("the world is square"), "world is square") - preprocessing.STOPWORDS = STOPWORDS + with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])): + self.assertEqual(remove_stopwords("the world is square"), "world is square") def test_strip_stopword_tokens(self): self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "sphere"]) # confirm redifining the global 
`STOPWORDS` working - STOPWORDS = preprocessing.STOPWORDS - preprocessing.STOPWORDS = frozenset(["the"]) - self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "is", "sphere"]) - preprocessing.STOPWORDS = STOPWORDS + with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])): + self.assertEqual( + remove_stopword_tokens(["the", "world", "is", "sphere"]), + ["world", "is", "sphere"] + ) def test_strip_short_tokens(self): self.assertEqual(remove_short_tokens(["salut", "les", "amis", "du", "59"], 3), ["salut", "les", "amis"]) From f0adc73e3537633f821e43bf2688701476e839b2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 12 Aug 2021 10:44:16 +0900 Subject: [PATCH 7/7] rm unused import --- gensim/test/test_parsing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/test/test_parsing.py b/gensim/test/test_parsing.py index 305b3e501a..f96ad332d2 100644 --- a/gensim/test/test_parsing.py +++ b/gensim/test/test_parsing.py @@ -11,8 +11,6 @@ import mock import numpy as np - -import gensim.parsing.preprocessing from gensim.parsing.preprocessing import ( remove_short_tokens, remove_stopword_tokens,