Skip to content

Commit

Permalink
Implement dealing with np.nan, closes jbesomi#86
Browse files Browse the repository at this point in the history
Every function in the library now handles NaNs correctly.

Implemented through decorator @handle_nans in new file _helper.py.

Tests added in test_nan.py

As we went through the whole library anyway, the argument "input" was renamed to "s" in some functions to be in line with the others.

Co-authored-by: Maximilian Krahn <maximilian.krahn@icloud.com>
  • Loading branch information
henrifroese and mk2510 committed Jul 17, 2020
1 parent 7fdc168 commit ce196d4
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 79 deletions.
4 changes: 2 additions & 2 deletions tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
["remove_brackets", preprocessing.remove_brackets, (s_text,)],
["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
["tokenize", preprocessing.tokenize, (s_text,)],
["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text_list,)],
["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text,)],
["replace_urls", preprocessing.replace_urls, (s_text, "")],
["remove_urls", preprocessing.remove_urls, (s_text,)],
["replace_tags", preprocessing.replace_tags, (s_text, "")],
Expand All @@ -61,7 +61,7 @@
representation.term_frequency,
(preprocessing.tokenize(s_text),),
],
#["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),),],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
Expand Down
6 changes: 2 additions & 4 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Define valid inputs for different functions.
s_nan = pd.Series([np.NaN, "test"])
s_numeric_and_nan_lists = pd.Series([[5.0, np.NaN], [6.0, 6.0]])
s_numeric_and_nan_lists = pd.Series([[5.0, 5.0], [6.0, 6.0], np.nan])

# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
Expand All @@ -27,7 +27,6 @@
]

test_cases_preprocessing = [
["fillna", preprocessing.fillna, (s_nan,)],
["lowercase", preprocessing.lowercase, (s_nan,)],
["replace_digits", preprocessing.replace_digits, (s_nan, "")],
["remove_digits", preprocessing.remove_digits, (s_nan,)],
Expand All @@ -38,7 +37,6 @@
["replace_stopwords", preprocessing.replace_stopwords, (s_nan, "")],
["remove_stopwords", preprocessing.remove_stopwords, (s_nan,)],
["stem", preprocessing.stem, (s_nan,)],
["clean", preprocessing.clean, (s_nan,)],
["remove_round_brackets", preprocessing.remove_round_brackets, (s_nan,)],
["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_nan,)],
["remove_square_brackets", preprocessing.remove_square_brackets, (s_nan,)],
Expand All @@ -59,7 +57,7 @@
representation.term_frequency,
(preprocessing.tokenize(s_nan),),
],
#["tfidf", representation.tfidf, (preprocessing.tokenize(s_nan),)],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_nan),)],
["pca", representation.pca, (s_numeric_and_nan_lists, 0)],
["nmf", representation.nmf, (s_numeric_and_nan_lists,)],
["tsne", representation.tsne, (s_numeric_and_nan_lists,)],
Expand Down
6 changes: 1 addition & 5 deletions tests/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,13 @@ def test_noun_chunks(self):

def test_count_sentences(self):
s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
s_true = pd.Series(3)
s_true = pd.Series(3, dtype=object)
self.assertEqual(nlp.count_sentences(s), s_true)

def test_count_sentences_numeric(self):
s = pd.Series([13.0, 42.0])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_missing_value(self):
s = pd.Series(["Test.", np.nan])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_index(self):
s = pd.Series(["Test"], index=[5])
counted_sentences_s = nlp.count_sentences(s)
Expand Down
86 changes: 86 additions & 0 deletions texthero/_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Useful helper functions for the texthero library.
"""

import functools
import wrapt
import numpy as np


"""
Decorators.
"""


def handle_nans(wrapped=None, input_only=False):
    """
    Decorator to make a function not change NaN values.

    The decorated function must take a Pandas Series as its first
    argument. Before the function runs, the NaN cells are removed from
    that Series; afterwards (unless ``input_only`` is True) they are
    re-inserted into the returned Series at their original index
    positions, so the wrapped function never sees a NaN.

    Parameters
    ----------
    input_only: Boolean, default to False.
        Set to True when the output that is returned by the
        function is _not_ the same as the input series
        with (some) cells changed (e.g. in top_words,
        the output Series is different from the input
        Series, and in pca there is no return, so in both
        cases input_only is set to True).

    Examples
    --------
    >>> from texthero._helper import *
    >>> import pandas as pd
    >>> import numpy as np
    >>> @handle_nans
    ... def replace_a_with_b(s):
    ...     return s.str.replace("a", "b")
    >>> s_with_nan = pd.Series(["Test a", np.nan])
    >>> replace_a_with_b(s_with_nan)
    0    Test b
    1       NaN
    dtype: object
    """
    # Called as @handle_nans(input_only=...): return a decorator with
    # input_only already bound.
    if wrapped is None:
        return functools.partial(handle_nans, input_only=input_only)

    @functools.wraps(wrapped)
    def wrapper(*args, **kwargs):
        # First positional argument is the input Series.
        s = args[0]
        nan_mask = ~s.isna()

        # Need a copy, as writing through s[nan_mask] would mutate the
        # caller's Series.
        s_result = s.copy()

        # Hand the wrapped function only the non-NaN cells.
        output = wrapped(s[nan_mask], *args[1:], **kwargs)

        if input_only:
            # Output is unrelated to the input Series (or there is no
            # return at all), so there is nothing to refill.
            return output

        # Normalize to a tuple so the first return value (the Series)
        # can be handled uniformly.
        if not isinstance(output, tuple):
            output = (output,)

        # Refill: assignment through the boolean mask aligns on the
        # index, so the NaN cells keep their original positions.
        s_result[nan_mask] = output[0]

        # Recover index name if the wrapped function set one.
        if output[0].index.name:
            s_result.index.name = output[0].index.name

        # A single-Series output is returned bare, not as a 1-tuple.
        return (s_result,) + output[1:] if output[1:] else s_result

    return wrapper
7 changes: 6 additions & 1 deletion texthero/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import spacy
import pandas as pd

from texthero._helper import handle_nans


@handle_nans
def named_entities(s, package="spacy"):
"""
Return named-entities.
Expand Down Expand Up @@ -57,6 +60,7 @@ def named_entities(s, package="spacy"):
return pd.Series(entities, index=s.index)


@handle_nans
def noun_chunks(s):
"""
Return noun chunks (noun phrases).
Expand Down Expand Up @@ -101,6 +105,7 @@ def noun_chunks(s):
return pd.Series(noun_chunks, index=s.index)


@handle_nans
def count_sentences(s: pd.Series) -> pd.Series:
"""
Count the number of sentences per cell in a Pandas Series.
Expand All @@ -117,7 +122,7 @@ def count_sentences(s: pd.Series) -> pd.Series:
>>> hero.count_sentences(s)
0 2
1 3
dtype: int64
dtype: object
"""
number_of_sentences = []

Expand Down
Loading

0 comments on commit ce196d4

Please sign in to comment.