Skip to content

Commit

Permalink
Implement dealing with np.nan, closes jbesomi#86
Browse files Browse the repository at this point in the history
Every function in the library now handles NaNs correctly.

Implemented through decorator @handle_nans in new file _helper.py.

Tests added in test_nan.py

As we went through the whole library anyway, the argument "input" was renamed to "s" in some functions to be in line with the others.

Co-authored-by: Maximilian Krahn <maximilian.krahn@icloud.com>
  • Loading branch information
henrifroese and mk2510 committed Jul 17, 2020
1 parent 7fdc168 commit ce196d4
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 79 deletions.
4 changes: 2 additions & 2 deletions tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
["remove_brackets", preprocessing.remove_brackets, (s_text,)],
["remove_html_tags", preprocessing.remove_html_tags, (s_text,)],
["tokenize", preprocessing.tokenize, (s_text,)],
["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text_list,)],
["tokenize_with_phrases", preprocessing.tokenize_with_phrases, (s_text,)],
["replace_urls", preprocessing.replace_urls, (s_text, "")],
["remove_urls", preprocessing.remove_urls, (s_text,)],
["replace_tags", preprocessing.replace_tags, (s_text, "")],
Expand All @@ -61,7 +61,7 @@
representation.term_frequency,
(preprocessing.tokenize(s_text),),
],
#["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),),],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_text),),],
["pca", representation.pca, (s_numeric_lists, 0)],
["nmf", representation.nmf, (s_numeric_lists,)],
["tsne", representation.tsne, (s_numeric_lists,)],
Expand Down
6 changes: 2 additions & 4 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# Define valid inputs for different functions.
s_nan = pd.Series([np.NaN, "test"])
s_numeric_and_nan_lists = pd.Series([[5.0, np.NaN], [6.0, 6.0]])
s_numeric_and_nan_lists = pd.Series([[5.0, 5.0], [6.0, 6.0], np.nan])

# Define all test cases. Every test case is a list
# of [name of test case, function to test, tuple of valid input for the function].
Expand All @@ -27,7 +27,6 @@
]

test_cases_preprocessing = [
["fillna", preprocessing.fillna, (s_nan,)],
["lowercase", preprocessing.lowercase, (s_nan,)],
["replace_digits", preprocessing.replace_digits, (s_nan, "")],
["remove_digits", preprocessing.remove_digits, (s_nan,)],
Expand All @@ -38,7 +37,6 @@
["replace_stopwords", preprocessing.replace_stopwords, (s_nan, "")],
["remove_stopwords", preprocessing.remove_stopwords, (s_nan,)],
["stem", preprocessing.stem, (s_nan,)],
["clean", preprocessing.clean, (s_nan,)],
["remove_round_brackets", preprocessing.remove_round_brackets, (s_nan,)],
["remove_curly_brackets", preprocessing.remove_curly_brackets, (s_nan,)],
["remove_square_brackets", preprocessing.remove_square_brackets, (s_nan,)],
Expand All @@ -59,7 +57,7 @@
representation.term_frequency,
(preprocessing.tokenize(s_nan),),
],
#["tfidf", representation.tfidf, (preprocessing.tokenize(s_nan),)],
# ["tfidf", representation.tfidf, (preprocessing.tokenize(s_nan),)],
["pca", representation.pca, (s_numeric_and_nan_lists, 0)],
["nmf", representation.nmf, (s_numeric_and_nan_lists,)],
["tsne", representation.tsne, (s_numeric_and_nan_lists,)],
Expand Down
6 changes: 1 addition & 5 deletions tests/test_nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,13 @@ def test_noun_chunks(self):

def test_count_sentences(self):
s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
s_true = pd.Series(3)
s_true = pd.Series(3, dtype=object)
self.assertEqual(nlp.count_sentences(s), s_true)

def test_count_sentences_numeric(self):
s = pd.Series([13.0, 42.0])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_missing_value(self):
s = pd.Series(["Test.", np.nan])
self.assertRaises(TypeError, nlp.count_sentences, s)

def test_count_sentences_index(self):
s = pd.Series(["Test"], index=[5])
counted_sentences_s = nlp.count_sentences(s)
Expand Down
86 changes: 86 additions & 0 deletions texthero/_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Useful helper functions for the texthero library.
"""

import functools
import wrapt
import numpy as np


"""
Decorators.
"""


def handle_nans(wrapped=None, input_only=False):
    """
    Decorator to make a function not change NaN values.

    The decorated function must take a Pandas Series as its first
    argument. Before the function runs, the NaN cells are removed from
    that Series; afterwards (unless ``input_only`` is True) they are
    re-inserted into the returned Series at their original index
    positions, so the wrapped function never sees a NaN.

    Parameters
    ----------
    input_only: Boolean, default to False.
        Set to True when the output that is returned by the
        function is _not_ the same as the input series
        with (some) cells changed (e.g. in top_words,
        the output Series is different from the input
        Series, and in pca there is no return, so in both
        cases input_only is set to True).

    Examples
    --------
    >>> from texthero._helper import *
    >>> import pandas as pd
    >>> import numpy as np
    >>> @handle_nans
    ... def replace_a_with_b(s):
    ...     return s.str.replace("a", "b")
    >>> s_with_nan = pd.Series(["Test a", np.nan])
    >>> replace_a_with_b(s_with_nan)
    0    Test b
    1       NaN
    dtype: object
    """
    # Called as @handle_nans(input_only=...): return a decorator with
    # input_only already bound.
    if wrapped is None:
        return functools.partial(handle_nans, input_only=input_only)

    @functools.wraps(wrapped)
    def wrapper(*args, **kwargs):
        # First positional argument is the input Series.
        s = args[0]
        nan_mask = ~s.isna()

        # Need a copy, as writing through s[nan_mask] would mutate the
        # caller's Series.
        s_result = s.copy()

        # Hand the wrapped function only the non-NaN cells.
        output = wrapped(s[nan_mask], *args[1:], **kwargs)

        if input_only:
            # Output is unrelated to the input Series (or there is no
            # return at all), so there is nothing to refill.
            return output

        # Normalize to a tuple so the first return value (the Series)
        # can be handled uniformly.
        if not isinstance(output, tuple):
            output = (output,)

        # Refill: assignment through the boolean mask aligns on the
        # index, so the NaN cells keep their original positions.
        s_result[nan_mask] = output[0]

        # Recover index name if the wrapped function set one.
        if output[0].index.name:
            s_result.index.name = output[0].index.name

        # A single-Series output is returned bare, not as a 1-tuple.
        return (s_result,) + output[1:] if output[1:] else s_result

    return wrapper
7 changes: 6 additions & 1 deletion texthero/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import spacy
import pandas as pd

from texthero._helper import handle_nans


@handle_nans
def named_entities(s, package="spacy"):
"""
Return named-entities.
Expand Down Expand Up @@ -57,6 +60,7 @@ def named_entities(s, package="spacy"):
return pd.Series(entities, index=s.index)


@handle_nans
def noun_chunks(s):
"""
Return noun chunks (noun phrases).
Expand Down Expand Up @@ -101,6 +105,7 @@ def noun_chunks(s):
return pd.Series(noun_chunks, index=s.index)


@handle_nans
def count_sentences(s: pd.Series) -> pd.Series:
"""
Count the number of sentences per cell in a Pandas Series.
Expand All @@ -117,7 +122,7 @@ def count_sentences(s: pd.Series) -> pd.Series:
>>> hero.count_sentences(s)
0 2
1 3
dtype: int64
dtype: object
"""
number_of_sentences = []

Expand Down
Loading

0 comments on commit ce196d4

Please sign in to comment.