Skip to content

Commit

Permalink
Isolate generic preprocessing functions (piskvorky#3180)
Browse files Browse the repository at this point in the history
* Move preprocessing functions from textcorpus module

* Move preprocessing functions from lowcorpus module

* Add test cases for preprocessing functions

* Fix styling issues

* Refactor remove_stopwords() and strip_short()

* make tests pass

* rm unused import

Co-authored-by: Michael Penkov <m@penkov.dev>
  • Loading branch information
2 people authored and tbbharaj committed Aug 19, 2021
1 parent 05d99ed commit 6c4cd0e
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 115 deletions.
21 changes: 2 additions & 19 deletions gensim/corpora/lowcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,11 @@

from gensim import utils
from gensim.corpora import IndexedCorpus

from gensim.parsing.preprocessing import split_on_space

logger = logging.getLogger(__name__)


def split_on_space(s):
    """Tokenize a line by splitting on single spaces, dropping empty tokens.

    Used in :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Some line.

    Returns
    -------
    list of str
        List of tokens from `s`.

    """
    tokens = utils.to_unicode(s).strip().split(' ')
    # filter(None, ...) drops the empty strings produced by consecutive spaces.
    return list(filter(None, tokens))


class LowCorpus(IndexedCorpus):
"""Corpus handles input in `GibbsLda++ format <http://gibbslda.sourceforge.net/>`_.
Expand Down Expand Up @@ -86,7 +69,7 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
If not provided, the mapping is constructed directly from `fname`.
line2words : callable, optional
Function which converts lines(str) into tokens(list of str),
using :func:`~gensim.corpora.lowcorpus.split_on_space` as default.
using :func:`~gensim.parsing.preprocessing.split_on_space` as default.
"""
IndexedCorpus.__init__(self, fname)
Expand Down
104 changes: 13 additions & 91 deletions gensim/corpora/textcorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,93 +44,15 @@

from gensim import interfaces, utils
from gensim.corpora.dictionary import Dictionary
from gensim.parsing.preprocessing import STOPWORDS, RE_WHITESPACE
from gensim.parsing.preprocessing import (
remove_stopword_tokens, remove_short_tokens,
lower_to_unicode, strip_multiple_whitespaces,
)
from gensim.utils import deaccent, simple_tokenize

logger = logging.getLogger(__name__)


def remove_stopwords(tokens, stopwords=None):
    """Remove stopwords from `tokens`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords.
        If None, :const:`gensim.parsing.preprocessing.STOPWORDS` is used.

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    """
    if stopwords is None:
        # Resolve the default at call time, not def time, so a redefined
        # (e.g. monkeypatched) STOPWORDS is respected by later calls.
        stopwords = STOPWORDS
    return [token for token in tokens if token not in stopwords]


def remove_short(tokens, minsize=3):
    """Remove tokens shorter than `minsize` characters.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal length of a kept token (inclusive).

    Returns
    -------
    list of str
        List of tokens with all tokens shorter than `minsize` removed.

    """
    return [token for token in tokens if len(token) >= minsize]


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Lowercase `text` and convert it to unicode, via :func:`gensim.utils.any2unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding used for the conversion.
    errors : str, optional
        Error handling behaviour, used as parameter for `unicode` function (python2 only).

    Returns
    -------
    str
        Unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.any2unicode`
        Convert any string to unicode-string.

    """
    # Lowercase first, then decode — preserves the original operation order.
    lowered = text.lower()
    return utils.to_unicode(lowered, encoding, errors)


def strip_multiple_whitespaces(s):
    """Collapse every run of whitespace characters in `s` into a single space.

    Parameters
    ----------
    s : str
        Input string.

    Returns
    -------
    str
        Copy of `s` with each whitespace run replaced by one space.

    """
    collapsed = RE_WHITESPACE.sub(" ", s)
    return collapsed


class TextCorpus(interfaces.CorpusABC):
"""Helper class to simplify the pipeline of getting BoW vectors from plain text.
Expand Down Expand Up @@ -177,12 +99,12 @@ class TextCorpus(interfaces.CorpusABC):
The default preprocessing consists of:
#. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
#. :func:`~gensim.parsing.preprocessing.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
#. :func:`~gensim.utils.deaccent`- deaccent (asciifolding)
#. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one
#. :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces` - collapse multiple whitespaces into one
#. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace
#. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long
#. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords
#. :func:`~gensim.parsing.preprocessing.remove_short_tokens` - remove words less than 3 characters long
#. :func:`~gensim.parsing.preprocessing.remove_stopword_tokens` - remove stopwords
"""

Expand All @@ -204,15 +126,15 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
Each will be applied to the text of each document in order, and should return a single string with
the modified text. For Python 2, the original text will not be unicode, so it may be useful to
convert to unicode as the first character filter.
If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`,
:func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`.
If None - using :func:`~gensim.parsing.preprocessing.lower_to_unicode`,
:func:`~gensim.utils.deaccent` and :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`.
tokenizer : callable, optional
Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`.
token_filters : iterable of callable, optional
Each will be applied to the iterable of tokens in order, and should return another iterable of tokens.
These filters can add, remove, or replace tokens, or do nothing at all.
If None - using :func:`~gensim.corpora.textcorpus.remove_short` and
:func:`~gensim.corpora.textcorpus.remove_stopwords`.
If None - using :func:`~gensim.parsing.preprocessing.remove_short_tokens` and
:func:`~gensim.parsing.preprocessing.remove_stopword_tokens`.
Examples
--------
Expand Down Expand Up @@ -254,7 +176,7 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter

self.token_filters = token_filters
if self.token_filters is None:
self.token_filters = [remove_short, remove_stopwords]
self.token_filters = [remove_short_tokens, remove_stopword_tokens]

self.length = None
self.dictionary = None
Expand Down
95 changes: 91 additions & 4 deletions gensim/parsing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,20 @@
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)


def remove_stopwords(s):
def remove_stopwords(s, stopwords=None):
"""Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`.
Parameters
----------
s : str
stopwords : iterable of str, optional
Sequence of stopwords
If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`
Returns
-------
str
Unicode string without :const:`~gensim.parsing.preprocessing.STOPWORDS`.
Unicode string without `stopwords`.
Examples
--------
Expand All @@ -90,7 +93,29 @@ def remove_stopwords(s):
"""
s = utils.to_unicode(s)
return " ".join(w for w in s.split() if w not in STOPWORDS)
return " ".join(remove_stopword_tokens(s.split(), stopwords))


def remove_stopword_tokens(tokens, stopwords=None):
    """Filter stopwords out of a token sequence.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords.
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    """
    # Fall back to the module-level stopword set only when none was supplied.
    effective_stopwords = STOPWORDS if stopwords is None else stopwords
    return [tok for tok in tokens if tok not in effective_stopwords]


def strip_punctuation(s):
Expand Down Expand Up @@ -170,7 +195,26 @@ def strip_short(s, minsize=3):
"""
s = utils.to_unicode(s)
return " ".join(e for e in s.split() if len(e) >= minsize)
return " ".join(remove_short_tokens(s.split(), minsize))


def remove_short_tokens(tokens, minsize=3):
    """Remove tokens shorter than `minsize` characters.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal length of a kept token (inclusive).

    Returns
    -------
    list of str
        List of tokens with all tokens shorter than `minsize` removed.

    """
    return [token for token in tokens if len(token) >= minsize]


def strip_numeric(s):
Expand Down Expand Up @@ -308,6 +352,49 @@ def stem_text(text):
stem = stem_text


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, used as parameter for `unicode` function (python2 only).

    Returns
    -------
    str
        Unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.any2unicode`
        Convert any string to unicode-string.

    """
    # Keep the original order: lowercase the raw input first, then decode.
    lowercased = text.lower()
    return utils.to_unicode(lowercased, encoding, errors)


def split_on_space(s):
    """Split `s` on single spaces, discarding empty tokens.

    Used in :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Some line.

    Returns
    -------
    list of str
        List of tokens from `s`.

    """
    stripped = utils.to_unicode(s).strip()
    # Runs of spaces yield empty strings from split(' '); drop them.
    return [w for w in stripped.split(' ') if w]


DEFAULT_FILTERS = [
lambda x: x.lower(), strip_tags, strip_punctuation,
strip_multiple_whitespaces, strip_numeric,
Expand Down
26 changes: 25 additions & 1 deletion gensim/test/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@

import logging
import unittest

import mock
import numpy as np

from gensim.parsing.preprocessing import (
remove_short_tokens,
remove_stopword_tokens,
remove_stopwords,
stem_text,
split_alphanum,
split_on_space,
strip_multiple_whitespaces,
strip_non_alphanum,
strip_numeric,
Expand All @@ -21,7 +26,6 @@
strip_tags,
)


# several documents
doc1 = """C'est un trou de verdure où chante une rivière,
Accrochant follement aux herbes des haillons
Expand Down Expand Up @@ -76,6 +80,26 @@ def test_split_alphanum(self):
def test_strip_stopwords(self):
    """Check remove_stopwords() drops default stopwords from a sentence."""
    self.assertEqual(remove_stopwords("the world is square"), "world square")

    # Confirm that redefining the global `STOPWORDS` is respected at call time.
    with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])):
        self.assertEqual(remove_stopwords("the world is square"), "world is square")

def test_strip_stopword_tokens(self):
    """Check remove_stopword_tokens() drops default stopwords from a token list."""
    self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "sphere"])

    # Confirm that redefining the global `STOPWORDS` is respected at call time.
    with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])):
        self.assertEqual(
            remove_stopword_tokens(["the", "world", "is", "sphere"]),
            ["world", "is", "sphere"]
        )

def test_strip_short_tokens(self):
    """Check remove_short_tokens() drops tokens shorter than `minsize` characters."""
    self.assertEqual(remove_short_tokens(["salut", "les", "amis", "du", "59"], 3), ["salut", "les", "amis"])

def test_split_on_space(self):
    """Check split_on_space() splits on single spaces and ignores surrounding whitespace."""
    self.assertEqual(split_on_space(" salut les amis du 59 "), ["salut", "les", "amis", "du", "59"])

def test_stem_text(self):
target = \
"while it is quit us to be abl to search a larg " + \
Expand Down

0 comments on commit 6c4cd0e

Please sign in to comment.