Speed-Up Preprocessing + NLP #162

Open · wants to merge 26 commits into base: master

Commits (26)
fa342a9  added MultiIndex DF support (mk2510, Aug 18, 2020)
59a9f8c  beginning with tests (henrifroese, Aug 19, 2020)
19c52de  implemented correct sparse support (mk2510, Aug 19, 2020)
66e566c  Merge branch 'master_upstream' into change_representation_to_multicolumn (mk2510, Aug 21, 2020)
41f55a8  added back list() and rm .tolist() (mk2510, Aug 21, 2020)
217611a  rm .tolist() and added list() (mk2510, Aug 21, 2020)
6a3b56d  Adopted the test to the new dataframes (mk2510, Aug 21, 2020)
b8ff561  wrong format (mk2510, Aug 21, 2020)
e3af2f9  Address most review comments. (henrifroese, Aug 21, 2020)
77ad80e  Add more unittests for representation (henrifroese, Aug 21, 2020)
4c718b8  start or paralisation (mk2510, Aug 23, 2020)
4c7deb0  implement parallel in some functions (henrifroese, Aug 23, 2020)
8835847  added private functions to preprosseing and enable parallel (mk2510, Aug 23, 2020)
4fd7b94  Merge remote-tracking branch 'origin/decorator_for_parallelization' i… (mk2510, Aug 23, 2020)
a587768  begin to add tests (henrifroese, Aug 23, 2020)
e17a751  added test and NLP implementation (mk2510, Aug 23, 2020)
c0279b2  changed _helper to helper (mk2510, Aug 23, 2020)
83e534c  move config variables to new `config.py` (henrifroese, Aug 23, 2020)
eda100a  fix config import in __init__ (henrifroese, Aug 23, 2020)
ca3a98f  right parallel access (mk2510, Aug 23, 2020)
127291d  formatted code (mk2510, Aug 23, 2020)
823bbb4  changed back imports from the functions as not needed (mk2510, Aug 23, 2020)
a8e4994  Merge branch 'master_upstream' into decorator_for_parallelization (mk2510, Sep 22, 2020)
8f77b08  moved stem to nlp (mk2510, Sep 22, 2020)
b529181  fixed test (mk2510, Sep 22, 2020)
22a8100  parallized noun chunks (mk2510, Sep 22, 2020)
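
The commits above add a shared parallelization helper (the texthero.helper module) whose behavior is governed by the two constants introduced in texthero/config.py at the bottom of this diff. For orientation, here is a minimal sketch of that pattern; the function name `parallel`, the multiprocessing backend, and the chunking strategy are assumptions for illustration, while the config constants and the idea of wrapping private serial implementations (commit 8835847) come from the PR itself:

import functools
import multiprocessing

import numpy as np
import pandas as pd

from texthero import config  # provides MIN_LINES_FOR_PARALLELIZATION and PARALLELIZE


def parallel(s, func, **kwargs):
    # Hypothetical helper: apply a serial, module-level function (e.g. a private
    # _remove_digits) chunk-wise across processes when the Series is large enough.
    if not config.PARALLELIZE or len(s) < config.MIN_LINES_FOR_PARALLELIZATION:
        return func(s, **kwargs)

    n_jobs = multiprocessing.cpu_count()
    chunks = np.array_split(s, n_jobs)
    with multiprocessing.Pool(n_jobs) as pool:
        results = pool.map(functools.partial(func, **kwargs), chunks)
    return pd.concat(results)

A public function such as preprocessing.remove_digits would then delegate to its private serial counterpart, e.g. return parallel(s, _remove_digits, only_blocks=only_blocks) (names illustrative).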
273 changes: 268 additions & 5 deletions tests/test_helpers.py
@@ -9,16 +9,17 @@
import doctest
import unittest
import warnings
import string

from texthero import _helper
from texthero import helper, preprocessing, nlp

"""
Doctests.
"""


def load_tests(loader, tests, ignore):
    tests.addTests(doctest.DocTestSuite(_helper))
    tests.addTests(doctest.DocTestSuite(helper))
    return tests


@@ -35,7 +36,7 @@ class TestHelpers(PandasTestCase):
    def test_handle_nans(self):
        s = pd.Series(["Test", np.nan, pd.NA])

        @_helper.handle_nans(replace_nans_with="This was a NAN")
        @helper.handle_nans(replace_nans_with="This was a NAN")
        def f(s):
            return s

@@ -51,7 +52,7 @@ def f(s):
    def test_handle_nans_no_nans_in_input(self):
        s = pd.Series(["Test"])

        @_helper.handle_nans(replace_nans_with="This was a NAN")
        @helper.handle_nans(replace_nans_with="This was a NAN")
        def f(s):
            return s

@@ -63,7 +64,7 @@ def f(s):
    def test_handle_nans_index(self):
        s = pd.Series(["Test", np.nan, pd.NA], index=[4, 5, 6])

        @_helper.handle_nans(replace_nans_with="This was a NAN")
        @helper.handle_nans(replace_nans_with="This was a NAN")
        def f(s):
            return s

@@ -74,3 +75,265 @@ def f(s):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.assertTrue(f(s).index.equals(s_true.index))


class TestPreprocessingParallelized(PandasTestCase):
    """
    Test remove digits.
    """

    def setUp(self):
        helper.MIN_LINES_FOR_PARALLELIZATION = 0
        helper.PARALLELIZE = True

    def tearDown(self):
        helper.MIN_LINES_FOR_PARALLELIZATION = 10000
        helper.PARALLELIZE = True

    def parallelized_test_helper(self, func, s, non_parallel_s_true, **kwargs):

        s = s
        non_parallel_s_true = non_parallel_s_true

        pd.testing.assert_series_equal(non_parallel_s_true, func(s, **kwargs))

    def test_remove_digits_only_block(self):
        s = pd.Series("remove block of digits 1234 h1n1")
        s_true = pd.Series("remove block of digits h1n1")
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

    def test_remove_digits_any(self):
        s = pd.Series("remove block of digits 1234 h1n1")
        s_true = pd.Series("remove block of digits h n ")

        self.parallelized_test_helper(
            preprocessing.remove_digits, s, s_true, only_blocks=False
        )

    def test_remove_digits_brackets(self):
        s = pd.Series("Digits in bracket (123 $) needs to be cleaned out")
        s_true = pd.Series("Digits in bracket ( $) needs to be cleaned out")
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

    def test_remove_digits_start(self):
        s = pd.Series("123 starting digits needs to be cleaned out")
        s_true = pd.Series(" starting digits needs to be cleaned out")
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

    def test_remove_digits_end(self):
        s = pd.Series("end digits needs to be cleaned out 123")
        s_true = pd.Series("end digits needs to be cleaned out ")
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

    def test_remove_digits_phone(self):
        s = pd.Series("+41 1234 5678")
        s_true = pd.Series("+ ")
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

    def test_remove_digits_punctuation(self):
        s = pd.Series(string.punctuation)
        s_true = pd.Series(string.punctuation)
        self.parallelized_test_helper(preprocessing.remove_digits, s, s_true)

"""
Test replace digits
"""

def test_replace_digits(self):
s = pd.Series("1234 falcon9")
s_true = pd.Series("X falcon9")
self.parallelized_test_helper(
preprocessing.replace_digits, s, s_true, symbols="X"
)

def test_replace_digits_any(self):
s = pd.Series("1234 falcon9")
s_true = pd.Series("X falconX")
self.parallelized_test_helper(
preprocessing.replace_digits, s, s_true, symbols="X", only_blocks=False
)

"""
Remove punctuation.
"""

def test_remove_punctation(self):
s = pd.Series("Remove all! punctuation!! ()")
s_true = pd.Series(
"Remove all punctuation "
) # TODO maybe just remove space?
self.parallelized_test_helper(preprocessing.remove_punctuation, s, s_true)

"""
Remove diacritics.
"""

def test_remove_diactitics(self):
s = pd.Series("Montréal, über, 12.89, Mère, Françoise, noël, 889, اِس, اُس")
s_true = pd.Series("Montreal, uber, 12.89, Mere, Francoise, noel, 889, اس, اس")
self.parallelized_test_helper(preprocessing.remove_diacritics, s, s_true)

"""
Remove whitespace.
"""

def test_remove_whitespace(self):
s = pd.Series("hello world hello world ")
s_true = pd.Series("hello world hello world")
self.parallelized_test_helper(preprocessing.remove_whitespace, s, s_true)

"""
Test pipeline.
"""

def test_pipeline_stopwords(self):
s = pd.Series("E-I-E-I-O\nAnd on")
s_true = pd.Series("e-i-e-i-o\n ")
pipeline = [preprocessing.lowercase, preprocessing.remove_stopwords]
self.parallelized_test_helper(preprocessing.clean, s, s_true, pipeline=pipeline)

"""
Test remove html tags
"""

def test_remove_html_tags(self):
s = pd.Series("<html>remove <br>html</br> tags<html> &nbsp;")
s_true = pd.Series("remove html tags ")
self.parallelized_test_helper(preprocessing.remove_html_tags, s, s_true)

"""
Text tokenization
"""

def test_tokenize(self):
s = pd.Series("text to tokenize")
s_true = pd.Series([["text", "to", "tokenize"]])
self.parallelized_test_helper(preprocessing.tokenize, s, s_true)

"""
Has content
"""

def test_has_content(self):
s = pd.Series(["c", np.nan, "\t\n", " ", "", "has content", None])
s_true = pd.Series([True, False, False, False, False, True, False])
self.parallelized_test_helper(preprocessing.has_content, s, s_true)

"""
Test remove urls
"""

def test_remove_urls(self):
s = pd.Series("http://tests.com http://www.tests.com")
s_true = pd.Series(" ")
self.parallelized_test_helper(preprocessing.remove_urls, s, s_true)

"""
Remove brackets
"""

def test_remove_brackets(self):
s = pd.Series(
"Remove all [square_brackets]{/curly_brackets}(round_brackets)<angle_brackets>"
)
s_true = pd.Series("Remove all ")
self.parallelized_test_helper(preprocessing.remove_brackets, s, s_true)

"""
Test replace and remove tags
"""

def test_replace_tags(self):
s = pd.Series("Hi @tag, we will replace you")
s_true = pd.Series("Hi TAG, we will replace you")
self.parallelized_test_helper(
preprocessing.replace_tags, s, s_true, symbol="TAG"
)

def test_remove_tags_alphabets(self):
s = pd.Series("Hi @tag, we will remove you")
s_true = pd.Series("Hi , we will remove you")

self.parallelized_test_helper(preprocessing.remove_tags, s, s_true)

"""
Test replace and remove hashtags
"""

def test_replace_hashtags(self):
s = pd.Series("Hi #hashtag, we will replace you")
s_true = pd.Series("Hi HASHTAG, we will replace you")

self.parallelized_test_helper(
preprocessing.replace_hashtags, s, s_true, symbol="HASHTAG"
)

def test_remove_hashtags(self):
s = pd.Series("Hi #hashtag_trending123, we will remove you")
s_true = pd.Series("Hi , we will remove you")

self.parallelized_test_helper(preprocessing.remove_hashtags, s, s_true)

"""
Test NLP for parallelization
"""

"""
Named entity.
"""

def test_named_entities(self):
s = pd.Series("New York is a big city")
s_true = pd.Series([[("New York", "GPE", 0, 8)]])
self.parallelized_test_helper(nlp.named_entities, s, s_true)

"""
Noun chunks.
"""

def test_noun_chunks(self):
s = pd.Series("Today is such a beautiful day")
s_true = pd.Series(
[[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]]
)

self.parallelized_test_helper(nlp.noun_chunks, s, s_true)

"""
Count sentences.
"""

def test_count_sentences(self):
s = pd.Series("I think ... it counts correctly. Doesn't it? Great!")
s_true = pd.Series(3)
self.parallelized_test_helper(nlp.count_sentences, s, s_true)

"""
POS tagging.
"""

def test_pos(self):
s = pd.Series(["Today is such a beautiful day", "São Paulo is a great city"])

s_true = pd.Series(
[
[
("Today", "NOUN", "NN", 0, 5),
("is", "AUX", "VBZ", 6, 8),
("such", "DET", "PDT", 9, 13),
("a", "DET", "DT", 14, 15),
("beautiful", "ADJ", "JJ", 16, 25),
("day", "NOUN", "NN", 26, 29),
],
[
("São", "PROPN", "NNP", 0, 3),
("Paulo", "PROPN", "NNP", 4, 9),
("is", "AUX", "VBZ", 10, 12),
("a", "DET", "DT", 13, 14),
("great", "ADJ", "JJ", 15, 20),
("city", "NOUN", "NN", 21, 25),
],
]
)

self.parallelized_test_helper(nlp.pos_tag, s, s_true)
10 changes: 5 additions & 5 deletions tests/test_preprocessing.py
@@ -177,7 +177,7 @@ def test_tokenize_split_punctuation(self):
    def test_tokenize_not_split_in_between_punctuation(self):
        s = pd.Series(["don't say hello-world hello_world"])
        s_true = pd.Series([["don't", "say", "hello-world", "hello_world"]])
        self.assertEqual(preprocessing.tokenize(s), s_true)
        pd.testing.assert_series_equal(preprocessing.tokenize(s), s_true)

    """
    Has content
@@ -186,7 +186,7 @@ def test_tokenize_not_split_in_between_punctuation(self):
    def test_has_content(self):
        s = pd.Series(["c", np.nan, "\t\n", " ", "", "has content", None])
        s_true = pd.Series([True, False, False, False, False, True, False])
        self.assertEqual(preprocessing.has_content(s), s_true)
        pd.testing.assert_series_equal(preprocessing.has_content(s), s_true)

    """
    Test remove urls
@@ -195,17 +195,17 @@ def test_has_content(self):
    def test_remove_urls(self):
        s = pd.Series("http://tests.com http://www.tests.com")
        s_true = pd.Series(" ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)
        pd.testing.assert_series_equal(preprocessing.remove_urls(s), s_true)

    def test_remove_urls_https(self):
        s = pd.Series("https://tests.com https://www.tests.com")
        s_true = pd.Series(" ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)
        pd.testing.assert_series_equal(preprocessing.remove_urls(s), s_true)

    def test_remove_urls_multiline(self):
        s = pd.Series("https://tests.com \n https://tests.com")
        s_true = pd.Series(" \n ")
        self.assertEqual(preprocessing.remove_urls(s), s_true)
        pd.testing.assert_series_equal(preprocessing.remove_urls(s), s_true)

    """
    Remove brackets
5 changes: 5 additions & 0 deletions texthero/__init__.py
@@ -16,3 +16,8 @@
from .nlp import *

from . import stopwords

from . import helper

from . import config
from .config import *
2 changes: 2 additions & 0 deletions texthero/config.py
@@ -0,0 +1,2 @@
MIN_LINES_FOR_PARALLELIZATION = 10000
PARALLELIZE = True
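
These two values are the library-wide defaults. The parallelized tests above force the parallel code path by setting MIN_LINES_FOR_PARALLELIZATION to 0 through texthero.helper in setUp() and restoring it in tearDown(). A minimal sketch of flipping the same switches outside the test suite, assuming (as the tests do) that helper exposes these settings:

import pandas as pd

from texthero import helper, preprocessing

# The same knobs the unit tests flip in setUp()/tearDown():
helper.MIN_LINES_FOR_PARALLELIZATION = 1000  # parallelize once a Series reaches 1000 rows
helper.PARALLELIZE = False                   # or opt out of parallel execution entirely

s = pd.Series(["remove block of digits 1234 h1n1"] * 5000)
print(preprocessing.remove_digits(s).head())  # runs serially here because PARALLELIZE is False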