From 36a85ba7f9a7d82204a6eb1d7466d8a6dbb7c92c Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 19:31:18 +1000
Subject: [PATCH 01/13] Speed up pagerank by calculating only the first eigenvector

---
 gensim/summarization/pagerank_weighted.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py
index e5a31357d0..1978c6e1c7 100644
--- a/gensim/summarization/pagerank_weighted.py
+++ b/gensim/summarization/pagerank_weighted.py
@@ -2,10 +2,9 @@
 # -*- coding: utf-8 -*-
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
 from numpy import empty as empty_matrix
 from scipy.sparse import csr_matrix
-from scipy.linalg import eig
+from scipy.sparse.linalg import eigs
 from six.moves import xrange
 
 try:
@@ -21,8 +20,10 @@ def pagerank_weighted(graph, damping=0.85):
     probability_matrix = build_probability_matrix(graph)
 
     pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
-    vals, vecs = eig(pagerank_matrix, left=True, right=False)  # TODO optimize this.
-    return process_results(graph, vecs)
+
+    vals, vecs = eigs(pagerank_matrix.T, k=1)  # TODO raise an error if matrix has complex eigenvectors?
+
+    return process_results(graph, vecs.real)
 
 
 def build_adjacency_matrix(graph):
@@ -37,7 +38,7 @@
         neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
         for j in xrange(length):
             edge_weight = float(graph.edge_weight((current_node, nodes[j])))
-            if i != j and edge_weight != 0:
+            if i != j and edge_weight != 0.0:
                 row.append(i)
                 col.append(j)
                 data.append(edge_weight / neighbors_sum)
@@ -49,7 +50,7 @@ def build_probability_matrix(graph):
     dimension = len(graph.nodes())
     matrix = empty_matrix((dimension, dimension))
 
-    probability = 1 / float(dimension)
+    probability = 1.0 / float(dimension)
     matrix.fill(probability)
 
     return matrix
@@ -58,6 +59,6 @@ def build_probability_matrix(graph):
 def process_results(graph, vecs):
     scores = {}
     for i, node in enumerate(graph.nodes()):
-        scores[node] = abs(vecs[i][0])  # TODO: this is wasteful (only compute the principal component).
+        scores[node] = abs(vecs[i, :])
 
     return scores
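A note on the eigensolver swap in PATCH 01: scipy.linalg.eig computes every left eigenvector of the dense PageRank matrix, while scipy.sparse.linalg.eigs (ARPACK) can be asked for just the dominant eigenpair — and the left eigenvector of a matrix is the right eigenvector of its transpose, hence pagerank_matrix.T. A minimal sketch of the idea on a hypothetical 3-node graph (illustrative values, not from the series):

    import numpy as np
    from scipy.sparse.linalg import eigs

    # Row-stochastic transition matrix for a made-up 3-node graph.
    transition = np.array([[0.0, 0.5, 0.5],
                           [1.0, 0.0, 0.0],
                           [0.5, 0.5, 0.0]])
    damping = 0.85
    n = transition.shape[0]
    pagerank_matrix = damping * transition + (1.0 - damping) / n

    # eigs defaults to which='LM' (largest magnitude); for a stochastic
    # matrix, Perron-Frobenius says that eigenvalue is 1, and the matching
    # eigenvector of the transpose is the stationary (PageRank) distribution.
    vals, vecs = eigs(pagerank_matrix.T, k=1)
    scores = np.abs(vecs[:, 0].real)
    print(scores / scores.sum())  # normalised to a probability distribution

This is also why the TODO about complex eigenvectors is largely theoretical for a well-behaved graph: the dominant eigenpair of such a stochastic matrix is real, so taking vecs.real discards only numerical noise.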
From f0bfbd1f9c33a7e4489fd68496bcc49accc9371b Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 19:31:43 +1000
Subject: [PATCH 02/13] Speed up summarisation by skipping edges with small weights

---
 gensim/summarization/summarizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py
index 9e5e343299..2d46716b4a 100644
--- a/gensim/summarization/summarizer.py
+++ b/gensim/summarization/summarizer.py
@@ -10,13 +10,14 @@
 from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
 from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
 from gensim.corpora import Dictionary
-from scipy.sparse import csr_matrix
 from math import log10 as _log10
 from six.moves import xrange
 
 
 INPUT_MIN_LENGTH = 10
 
+WEIGHT_THRESHOLD = 1.e-3
+
 logger = logging.getLogger(__name__)
 
@@ -26,7 +27,7 @@ def _set_graph_edge_weights(graph):
 
     for i in xrange(len(documents)):
         for j in xrange(len(documents)):
-            if i == j:
+            if i == j or weights[i][j] < WEIGHT_THRESHOLD:
                 continue
 
             sentence_1 = documents[i]

From 527025f4e77d0a7b1d1ce3fbd5ee86c44651def9 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 19:32:48 +1000
Subject: [PATCH 03/13] Add a pos_filter argument to the keywords method

While the paper showed that the best performance was obtained with nouns
and adjectives, there are applications where including verbs, adverbs, or
even prepositions in the keyword list might be desirable.
---
 gensim/summarization/keywords.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 5146c560ad..0975e4f987 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -12,12 +12,13 @@
 from six.moves.queue import Queue as _Queue
 from six.moves import xrange
 
-
 WINDOW_SIZE = 2
 
-"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
+"""
+Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
 Example: filter for nouns and adjectives:
-INCLUDING_FILTER = ['NN', 'JJ']"""
+INCLUDING_FILTER = ['NN', 'JJ']
+"""
 INCLUDING_FILTER = ['NN', 'JJ']
 EXCLUDING_FILTER = []
 
@@ -26,8 +27,11 @@ def _get_pos_filters():
     return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
 
 
-def _get_words_for_graph(tokens):
-    include_filters, exclude_filters = _get_pos_filters()
+def _get_words_for_graph(tokens, pos_filter):
+    if pos_filter is None:
+        include_filters, exclude_filters = _get_pos_filters()
+    else:
+        include_filters = set(pos_filter)
     if include_filters and exclude_filters:
         raise ValueError("Can't use both include and exclude filters, should use only one")
 
@@ -189,13 +193,13 @@ def _format_results(_keywords, combined_keywords, split, scores):
     return "\n".join(combined_keywords)
 
 
-def keywords(text, ratio=0.2, words=None, split=False, scores=False):
+def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ']):
     # Gets a dict of word -> lemma
     tokens = _clean_text_by_word(text)
     split_text = list(_tokenize_by_word(text))
 
     # Creates the graph and adds the edges
-    graph = _build_graph(_get_words_for_graph(tokens))
+    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
     _set_graph_edges(graph, tokens, split_text)
     del split_text  # It's no longer used
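PATCH 02 prunes BM25 edges below WEIGHT_THRESHOLD before PageRank ever runs, shrinking the sentence graph, and PATCH 03 exposes the part-of-speech filter to callers. A hedged usage sketch of the new parameter — the file name and the widened tag set are illustrative, not from the series:

    from gensim.summarization import keywords

    with open("some_document.txt") as f:   # hypothetical input file
        text = f.read()

    # The default keeps nouns and adjectives ('NN', 'JJ'); widening the
    # filter to verbs trades precision for recall, per the commit message.
    print(keywords(text, pos_filter=['NN', 'JJ', 'VB'], split=True))

    # pos_filter=None falls back to the module-level INCLUDING_FILTER /
    # EXCLUDING_FILTER pair via _get_pos_filters().
    print(keywords(text, pos_filter=None))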
From f2ab8babaf36b177cb62b1080315489dffba688e Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 19:39:43 +1000
Subject: [PATCH 04/13] Make summarisation tests executable as standalone

---
 gensim/test/test_summarization.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py
index 909a3c60ac..b739c49e57 100644
--- a/gensim/test/test_summarization.py
+++ b/gensim/test/test_summarization.py
@@ -13,6 +13,7 @@
 """
 
 import os.path
+import logging
 import unittest
 
 from gensim import utils
@@ -128,3 +129,7 @@ def test_corpus_summarization_ratio(self):
         expected_summary_length = int(len(corpus) * ratio)
 
         self.assertEqual(len(selected_docs), expected_summary_length)
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()

From 7b0e5b48771903c613827e8238c26e6beae8b1c6 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 20:21:19 +1000
Subject: [PATCH 05/13] Fix bug not caught by tests

---
 gensim/summarization/keywords.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 0975e4f987..f585004812 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -32,6 +32,7 @@ def _get_words_for_graph(tokens, pos_filter):
         include_filters, exclude_filters = _get_pos_filters()
     else:
         include_filters = set(pos_filter)
+        exclude_filters = frozenset([])
     if include_filters and exclude_filters:
         raise ValueError("Can't use both include and exclude filters, should use only one")
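The bug fixed in PATCH 05: with an explicit pos_filter, the else branch added in PATCH 03 bound include_filters but never exclude_filters, so the very next statement raised NameError — a path the existing suite never exercised. A simplified sketch of the broken control flow (not the real function body):

    def _get_words_for_graph_buggy(tokens, pos_filter):
        if pos_filter is None:
            include_filters, exclude_filters = frozenset(['NN', 'JJ']), frozenset()
        else:
            include_filters = set(pos_filter)   # exclude_filters never bound!
        # Whenever a pos_filter is passed, the next line raises
        # NameError: name 'exclude_filters' is not defined.
        if include_filters and exclude_filters:
            raise ValueError("Can't use both include and exclude filters")

Initialising exclude_filters = frozenset([]) also keeps the later `unit.tag in exclude_filters` membership test valid.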
From 4b96eb0bc796a24a663d4f97b0f8409514322feb Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 20:56:04 +1000
Subject: [PATCH 06/13] Add some keyword tests

---
 gensim/test/test_data/mihalcea_tarau.kw.txt | 14 +++
 .../test/test_data/mihalcea_tarau.kwpos.txt | 17 ++++
 gensim/test/test_keywords.py                | 96 +++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 gensim/test/test_data/mihalcea_tarau.kw.txt
 create mode 100644 gensim/test/test_data/mihalcea_tarau.kwpos.txt
 create mode 100644 gensim/test/test_keywords.py

diff --git a/gensim/test/test_data/mihalcea_tarau.kw.txt b/gensim/test/test_data/mihalcea_tarau.kw.txt
new file mode 100644
index 0000000000..9ba4443c91
--- /dev/null
+++ b/gensim/test/test_data/mihalcea_tarau.kw.txt
@@ -0,0 +1,14 @@
+hurricane
+coast
+saturday
+storm
+flood
+flooding
+gilbert
+winds heavy
+strong
+defense
+puerto
+north
+weather
+southeast
\ No newline at end of file
diff --git a/gensim/test/test_data/mihalcea_tarau.kwpos.txt b/gensim/test/test_data/mihalcea_tarau.kwpos.txt
new file mode 100644
index 0000000000..b9597ee237
--- /dev/null
+++ b/gensim/test/test_data/mihalcea_tarau.kwpos.txt
@@ -0,0 +1,17 @@
+hurricane
+gilbert
+storm
+coast
+saturday
+winds heavy
+weather
+flood
+flooding
+strong
+defense
+people
+cabral said
+associated
+north
+residents
+southeast
\ No newline at end of file
diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
new file mode 100644
index 0000000000..d60db22819
--- /dev/null
+++ b/gensim/test/test_keywords.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated test to reproduce the results of Mihalcea and Tarau (2004).
+
+Mihalcea and Tarau (2004) introduces the TextRank summarization algorithm.
+As a validation of the gensim implementation, we reproduce its results
+in this test.
+
+"""
+
+import os.path
+import logging
+import unittest
+
+from gensim import utils
+from gensim.corpora import Dictionary
+from gensim.summarization import keywords
+
+
+class TestKeywordsTest(unittest.TestCase):
+
+    def test_text_keywords(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate keywords
+        generated_keywords = keywords(text)
+
+        # To be compared to the reference.
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
+            kw = f.read()
+
+        self.assertEqual(generated_keywords, kw)
+
+    def test_text_keywords_words(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate exactly 13 keywords
+        generated_keywords = keywords(text, words=13).split('\n')
+
+        self.assertEqual(len(generated_keywords), 13)
+
+    def test_text_keywords_pos(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate keywords using only certain parts of speech
+        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'])
+
+        # To be compared to the reference.
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
+            kw = f.read()
+
+        self.assertEqual(generated_keywords_NNVBJJ, kw)
+
+    def test_text_summarization_raises_exception_on_short_input_text(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
+            text = f.read()
+
+        # Keeps the first 8 sentences to make the text shorter.
+        text = "\n".join(text.split('\n')[:8])
+
+        self.assertTrue(keywords(text) is not None)
+
+    def test_empty_text_keywords_none(self):
+        self.assertRaises(ZeroDivisionError)
+
+    def test_keywords_ratio(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # Check ratio parameter is well behaved. Because length is taken on tokenized clean text
+        # we just check that ratio 40% is twice as long as ratio 20%
+        selected_docs_20 = keywords(text, ratio=0.2).split("\n")
+        selected_docs_40 = keywords(text, ratio=0.4).split("\n")
+
+        self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.93, places=2)
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()
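One wrinkle in the new test file: test_empty_text_keywords_none calls assertRaises with only the exception class and no callable, which merely returns an unused context manager — the test checks nothing and always passes (PATCH 11 below deletes it). A sketch of the difference, using a stand-in expression since whether empty input really raises ZeroDivisionError depends on keywords():

    import unittest

    class Demo(unittest.TestCase):
        def test_noop(self):
            # Returns a context manager that is never entered; asserts nothing.
            self.assertRaises(ZeroDivisionError)

        def test_checked(self):
            # The context-manager form actually verifies the exception.
            with self.assertRaises(ZeroDivisionError):
                1 / 0

    if __name__ == '__main__':
        unittest.main()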
From 1c49a98beade5469bc3b724cf938442b7fa394f1 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Sat, 29 Aug 2015 22:19:39 +1000
Subject: [PATCH 07/13] Naive attempt to make keyword tests robust to different hardware and python versions

---
 gensim/test/test_keywords.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
index d60db22819..6ff3c36b7b 100644
--- a/gensim/test/test_keywords.py
+++ b/gensim/test/test_keywords.py
@@ -36,7 +36,7 @@ def test_text_keywords(self):
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
             kw = f.read()
 
-        self.assertEqual(generated_keywords, kw)
+        self.assertEqual(set(generated_keywords), set(kw))
 
     def test_text_keywords_words(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
@@ -45,7 +45,7 @@ def test_text_keywords_words(self):
             text = f.read()
 
         # calculate exactly 13 keywords
-        generated_keywords = keywords(text, words=13).split('\n')
+        generated_keywords = keywords(text, words=13, split=True)
 
         self.assertEqual(len(generated_keywords), 13)
 
@@ -86,10 +86,10 @@ def test_keywords_ratio(self):
 
         # Check ratio parameter is well behaved. Because length is taken on tokenized clean text
         # we just check that ratio 40% is twice as long as ratio 20%
-        selected_docs_20 = keywords(text, ratio=0.2).split("\n")
-        selected_docs_40 = keywords(text, ratio=0.4).split("\n")
+        selected_docs_20 = keywords(text, ratio=0.2, split=True)
+        selected_docs_40 = keywords(text, ratio=0.4, split=True)
 
-        self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.93, places=2)
+        self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.9, places=1)
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
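On the loosened assertion in PATCH 07: assertAlmostEqual(a, b, places=1) passes when round(a - b, 1) == 0, i.e. agreement to within roughly 0.05 — loose enough to absorb platform-to-platform differences in the iterative eigensolve, tight enough to catch a broken ratio parameter. A quick check of that arithmetic:

    # The previously observed ratio of 1.93 still satisfies the new assertion...
    assert round(1.93 - 1.9, 1) == 0
    # ...while a genuinely wrong ratio would not.
    assert round(2.5 - 1.9, 1) != 0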
From 78c001af4757919c72d2f88f3872b000cabfda65 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Tue, 1 Sep 2015 13:37:27 +1000
Subject: [PATCH 08/13] Add user option to control lemmatising of keywords

---
 gensim/summarization/keywords.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index f585004812..92f4019274 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -194,7 +194,7 @@ def _format_results(_keywords, combined_keywords, split, scores):
     return "\n".join(combined_keywords)
 
 
-def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ']):
+def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False):
     # Gets a dict of word -> lemma
     tokens = _clean_text_by_word(text)
     split_text = list(_tokenize_by_word(text))
@@ -211,7 +211,14 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=
 
     extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
 
-    lemmas_to_word = _lemmas_to_words(tokens)
+    # The results can be polluted by many variations of the same word
+    if lemmatize:
+        lemmas_to_word = {}
+        for word, unit in tokens.iteritems():
+            lemmas_to_word[unit.token] = [word]
+    else:
+        lemmas_to_word = _lemmas_to_words(tokens)
+
     keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)
 
     # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined

From 34e0df54fc10c72b79c61eb037148dcf46446525 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Tue, 1 Sep 2015 17:24:59 +1000
Subject: [PATCH 09/13] Make keywords python 3 compatible

---
 gensim/summarization/keywords.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 92f4019274..155c448a80 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -11,6 +11,8 @@
 from itertools import combinations as _combinations
 from six.moves.queue import Queue as _Queue
 from six.moves import xrange
+from six import iteritems
+
 
 WINDOW_SIZE = 2
 
@@ -37,7 +39,7 @@ def _get_words_for_graph(tokens, pos_filter):
         raise ValueError("Can't use both include and exclude filters, should use only one")
 
     result = []
-    for word, unit in tokens.iteritems():
+    for word, unit in iteritems(tokens):
         if exclude_filters and unit.tag in exclude_filters:
             continue
         if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -116,7 +118,7 @@ def _extract_tokens(lemmas, scores, ratio, words):
 
 def _lemmas_to_words(tokens):
     lemma_to_word = {}
-    for word, unit in tokens.iteritems():
+    for word, unit in iteritems(tokens):
         lemma = unit.token
         if lemma in lemma_to_word:
             lemma_to_word[lemma].append(word)
@@ -214,7 +216,7 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=
     # The results can be polluted by many variations of the same word
     if lemmatize:
         lemmas_to_word = {}
-        for word, unit in tokens.iteritems():
+        for word, unit in iteritems(tokens):
             lemmas_to_word[unit.token] = [word]
     else:
         lemmas_to_word = _lemmas_to_words(tokens)
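PATCH 09's fix works because dict.iteritems() was removed in Python 3; six.iteritems dispatches to iteritems() on Python 2 and items() on Python 3. A self-contained sketch of the inversion pattern _lemmas_to_words uses, with a hypothetical plain word -> lemma dict standing in for the real word -> SyntacticUnit mapping:

    from six import iteritems

    tokens = {'flooded': 'flood', 'flooding': 'flood', 'storms': 'storm'}

    lemma_to_words = {}
    for word, lemma in iteritems(tokens):          # works on 2.x and 3.x
        lemma_to_words.setdefault(lemma, []).append(word)

    print(lemma_to_words)  # e.g. {'flood': ['flooded', 'flooding'], 'storm': ['storms']}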
From 8962ee3b03031355c54f7c943176b3d5b72ef67b Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Tue, 1 Sep 2015 17:25:53 +1000
Subject: [PATCH 10/13] Second attempt to make keywords tests robust to version and platform

---
 gensim/test/test_keywords.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
index 6ff3c36b7b..e617b803c4 100644
--- a/gensim/test/test_keywords.py
+++ b/gensim/test/test_keywords.py
@@ -30,13 +30,13 @@ def test_text_keywords(self):
             text = f.read()
 
         # calculate keywords
-        generated_keywords = keywords(text)
+        generated_keywords = keywords(text, split=True)
 
         # To be compared to the reference.
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
-            kw = f.read()
+            kw = f.read().strip().split("\n")
 
-        self.assertEqual(set(generated_keywords), set(kw))
+        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))
 
     def test_text_keywords_words(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
@@ -45,9 +45,9 @@ def test_text_keywords_words(self):
             text = f.read()
 
         # calculate exactly 13 keywords
-        generated_keywords = keywords(text, words=13, split=True)
+        generated_keywords = keywords(text, words=15, split=True)
 
-        self.assertEqual(len(generated_keywords), 13)
+        self.assertEqual(len(generated_keywords), 15)
 
     def test_text_keywords_pos(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
@@ -56,13 +56,13 @@ def test_text_keywords_pos(self):
             text = f.read()
 
         # calculate keywords using only certain parts of speech
-        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'])
+        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], split=True)
 
         # To be compared to the reference.
         with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
-            kw = f.read()
+            kw = f.read().strip().split("\n")
 
-        self.assertEqual(generated_keywords_NNVBJJ, kw)
+        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))
 
     def test_text_summarization_raises_exception_on_short_input_text(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

From e1db1e68472b5876157256c3b70d4d92376148c7 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Tue, 1 Sep 2015 18:10:09 +1000
Subject: [PATCH 11/13] Remove test which fails on python 2.6

---
 gensim/test/test_keywords.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
index e617b803c4..2e7c3a48f5 100644
--- a/gensim/test/test_keywords.py
+++ b/gensim/test/test_keywords.py
@@ -75,9 +75,6 @@ def test_text_summarization_raises_exception_on_short_input_text(self):
 
         self.assertTrue(keywords(text) is not None)
 
-    def test_empty_text_keywords_none(self):
-        self.assertRaises(ZeroDivisionError)
-
     def test_keywords_ratio(self):
         pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
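The normalisation in PATCH 10 addresses two separate instabilities: keyword order can vary with platform and solver, and Python 2 may yield unicode strings where the reference file yields str. Comparing set(map(str, ...)) on both sides makes the check order- and type-insensitive; a tiny illustration with hypothetical values:

    generated = [u'storm', u'hurricane']   # hypothetical model output
    reference = ['hurricane', 'storm']     # hypothetical reference lines

    # Unequal element-wise, but equal as normalised sets.
    assert set(map(str, generated)) == set(map(str, reference))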
From 45e6c5b587aa3bcb86cc0fa3ef102b69495407a5 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Tue, 1 Sep 2015 18:10:39 +1000
Subject: [PATCH 12/13] Try removing unicode transform which breaks on python3

---
 gensim/summarization/keywords.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 155c448a80..e6e4c9d655 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -163,7 +163,8 @@ def _get_combined_keywords(_keywords, split_text):
             result.append(word)   # appends last word if keyword and doesn't iterate
             for j in xrange(i + 1, len_text):
                 other_word = _strip_word(split_text[j])
-                if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
+                # if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
+                if other_word in _keywords and other_word == split_text[j]:
                     combined_word.append(other_word)
                 else:
                     for keyword in combined_word:

From 8fdfff71f16887d668b3c0eb0c116495901d1206 Mon Sep 17 00:00:00 2001
From: Keiran Thompson
Date: Fri, 18 Sep 2015 16:14:32 +1000
Subject: [PATCH 13/13] Revert "Try removing unicode transform which breaks on python3"

This reverts commit 45e6c5b587aa3bcb86cc0fa3ef102b69495407a5.
---
 gensim/summarization/keywords.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index e6e4c9d655..155c448a80 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -163,8 +163,7 @@ def _get_combined_keywords(_keywords, split_text):
             result.append(word)   # appends last word if keyword and doesn't iterate
             for j in xrange(i + 1, len_text):
                 other_word = _strip_word(split_text[j])
-                # if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
-                if other_word in _keywords and other_word == split_text[j]:
+                if other_word in _keywords and other_word == split_text[j].decode("utf-8"):
                     combined_word.append(other_word)
                 else:
                     for keyword in combined_word:
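The series closes by restoring the Python 2 behaviour it briefly removed: under Python 2, split_text holds UTF-8 byte strings that must be decoded before comparison against a unicode keyword, while under Python 3 the elements are already str and calling .decode() on them fails — which is what PATCH 12 tried to work around and PATCH 13 reverts. One version-agnostic way out, sketched as an assumption rather than the fix this series settled on:

    from six import PY2

    def to_unicode(token):
        # Python 2: decode UTF-8 bytes so they compare equal to unicode
        # keywords. Python 3: tokens are already str; leave them alone.
        if PY2 and isinstance(token, bytes):
            return token.decode("utf-8")
        return token

    # other_word == to_unicode(split_text[j]) would then hold on both versions.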