From 9ce1e1403f5e972217d9273b57fd16d78ad640da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabr=C3=ADcio=20Raphael?=
Date: Tue, 23 Jul 2024 01:27:06 -0300
Subject: [PATCH] Correction of tests for the WikiCorpus class to be able to
 receive a list of tokenizing functions.

---
 gensim/test/test_corpora.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index c948080ef3..7c27f68887 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -620,7 +620,7 @@ def test_indexing(self):
 
 # Needed for the test_simple_tokenizer and test_list_tokenizers are the TestWikiCorpus class.
 # Cannot be nested due to serializing.
-def simple_tokenize(content, token_min_len=2, token_max_len=15, lower=True):
+def simple_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
     return [
         token for token in (content.lower() if lower else content).split()
         if token_min_len <= len(token) <= token_max_len]
@@ -732,7 +732,8 @@ def test_list_tokenizers(self):
         """
         define a list containing two tokenizers functions (simple and custom) and use it
         """
-        wc = self.corpus_class(self.enwiki, processes=1, tokenizer_func=[simple_tokenizer, custom_tokenizer],
+        wc = self.corpus_class(self.enwiki, processes=1,
+                               tokenizer_func=[simple_tokenizer, custom_tokenizer],
                                token_max_len=16, token_min_len=1, lower=False)
         row = wc.get_texts()
         list_tokens = next(row)