[WIP] Refactor documentation API Reference for gensim.summarization #1709
Changes from 2 commits
@@ -3,6 +3,13 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Text Cleaner

This module contains functions and processors used for processing text,
extracting sentences from text, working with acronyms and abbreviations.
"""


from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
@@ -22,28 +29,102 @@


SEPARATOR = r'@'
-RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
"""str: special separator used in abbreviations."""
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
"""SRE_Pattern: pattern to split text into sentences."""

Review comment: Problem with building here.
Review comment: @yurkai good example, how to document it: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/parsing/preprocessing.py#L21

AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
"""SRE_Pattern: pattern for detecting abbreviations (example: Sgt. Pepper)."""
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
"""SRE_Pattern: one more pattern for detecting acronyms."""
AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
"""SRE_Pattern: one more pattern for detecting acronyms
(example: P.S. I love you)."""
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
"""SRE_Pattern: pattern like AB_SENIOR, but with SEPARATOR between the abbreviation
and the next word."""
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
"""SRE_Pattern: pattern like AB_ACRONYM, but with SEPARATOR between the abbreviation
and the next word."""


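The intent of these patterns could be shown with a short doctest (an illustrative sketch only; the import path assumes the constants stay in `gensim.summarization.textcleaner`):

>>> from gensim.summarization.textcleaner import AB_SENIOR, AB_ACRONYM_LETTERS
>>> AB_SENIOR.sub(r'\1@\2', 'Mrs. Robinson')
'Mrs.@Robinson'
>>> AB_ACRONYM_LETTERS.sub(r'\1\2', 'P.S. I love you')
'PS I love you'
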
def split_sentences(text):
    """Splits and returns a list of sentences from the given text. Preserves the
    abbreviations set in `AB_SENIOR` and `AB_ACRONYM`.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        List of sentences from the text.
Review comment: Doesn't match with return type.
    """
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


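A doctest along these lines could illustrate `split_sentences` (an illustrative sketch; the import path assumes the module remains at `gensim.summarization.textcleaner`):

>>> from gensim.summarization.textcleaner import split_sentences
>>> split_sentences("Mrs. Robinson is waiting. Please, come in.")
['Mrs. Robinson is waiting.', 'Please, come in.']
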
def replace_abbreviations(text):
    """Replaces the blank space between an abbreviation and the next word with the `@` separator.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        Text with the separator inserted after abbreviations.

    Example
    -------
    >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
    'God bless you, please, Mrs.@Robinson'
    """
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    """Replaces the `@` separator back with a blank space after each abbreviation.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    str
        Sentence with the blank-space separator restored.

    Example
    -------
    >>> undo_replacement("God bless you, please, Mrs.@Robinson")
    'God bless you, please, Mrs. Robinson'
    """
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    """Replaces, for every match of the given regular expressions, whatever lies between
    the two captured groups with `separator`.

    Parameters
    ----------
    text : str
        Input text.
    separator : str
        The separator to insert between the matched words.
    regexs : list of SRE_Pattern
        List of regular expressions.
Review comment: doesn't match.

    Returns
    -------
    str
        Text with replaced separators.
    """
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:

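A short doctest here could make the replacement behaviour concrete (a sketch; it reuses the `AB_SENIOR` pattern defined above):

>>> from gensim.summarization.textcleaner import replace_with_separator, AB_SENIOR
>>> replace_with_separator("Mrs. Robinson is waiting", "@", [AB_SENIOR])
'Mrs.@Robinson is waiting'
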
@@ -52,11 +133,49 @@ def replace_with_separator(text, separator, regexs):


def get_sentences(text):
    """Sentence generator from the provided text. The sentence pattern is set in `RE_SENTENCE`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Single sentence extracted from the text.

    Example
    -------
    >>> text = "Does this text contain two sentences? Yes, it does."
    >>> for sentence in get_sentences(text):
    ...     print(sentence)
    Does this text contain two sentences?
    Yes, it does.
    """
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Processes the given sentences and their filtered (tokenized) copies into
    SyntacticUnit objects, adding tags to the produced units if they are provided.
    Returns a list of SyntacticUnit objects.

    Parameters
    ----------
    original_units : list
        List of original sentences.
    filtered_units : list
        List of tokenized sentences.
    tags : list of str, optional
        List of strings used as tags for each unit.
Review comment: Don't write about default parameter if this isn't special.

    Returns
    -------
    list
        SyntacticUnit for each input item.
Review comment: Need to use link to type, like :class:.

    """
    units = []
    for i in xrange(len(original_units)):
        if filtered_units[i] == '':

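A small usage sketch could clarify what the produced units hold (illustrative only; it assumes `SyntacticUnit` from `gensim.summarization.syntactic_unit` keeps the original text in `.text` and the processed form in `.token`):

>>> from gensim.summarization.textcleaner import merge_syntactic_units
>>> units = merge_syntactic_units(["Mrs. Robinson is waiting."], ["mrs robinson wait"])
>>> units[0].text
'Mrs. Robinson is waiting.'
>>> units[0].token
'mrs robinson wait'
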
@@ -74,21 +193,59 @@ def merge_syntactic_units(original_units, filtered_units, tags=None):


def join_words(words, separator=" "):
    """Merges words into a single string using the separator (a blank space by default).

    Parameters
    ----------
    words : list
        List of words.
    separator : str, optional
        The separator between elements; a single blank space by default.
Review comment: Blank? I see

    Returns
    -------
    str
        String of merged words with the separator between them.
    """
    return separator.join(words)


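A one-line doctest would be enough here (illustrative sketch):

>>> from gensim.summarization.textcleaner import join_words
>>> join_words(["the", "quick", "brown", "fox"])
'the quick brown fox'
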
def clean_text_by_sentences(text):
-    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
-    Returns a SyntacticUnit list. """
    """Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a list of SyntacticUnit objects.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list
        SyntacticUnit objects, one for each sentence.
    """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)


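An end-to-end doctest could show the typical call (a sketch; the exact `.token` values depend on the default preprocessing filters, so only the original sentences are checked):

>>> from gensim.summarization.textcleaner import clean_text_by_sentences
>>> sentences = clean_text_by_sentences("Mrs. Robinson is waiting. Please, come in.")
>>> [unit.text for unit in sentences]
['Mrs. Robinson is waiting.', 'Please, come in.']
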
def clean_text_by_word(text, deacc=True):
-    """ Tokenizes a given text into words, applying filters and lemmatizing them.
-    Returns a dict of word -> syntacticUnit. """
    """Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dictionary of word -> SyntacticUnit.

    Parameters
    ----------
    text : str
Review comment: type doesn't match.
        Input text.
    deacc : bool, optional
        Remove accentuation if True (default).

    Returns
    -------
    dict
        Word as key, SyntacticUnit as value.
    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]

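A small doctest could be added here as well (a sketch; it assumes the returned mapping is keyed by the lowercased word, as the word -> SyntacticUnit summary suggests):

>>> from gensim.summarization.textcleaner import clean_text_by_word
>>> "robinson" in clean_text_by_word("Mrs. Robinson is waiting.")
True
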
@@ -101,5 +258,19 @@ def clean_text_by_word(text, deacc=True):


def tokenize_by_word(text):
    """Tokenizes the input text. Before tokenizing, transforms the text to lower case and
    removes accentuation and the periods inside acronyms matched by `AB_ACRONYM_LETTERS`.
    Returns a generator of words.

    Parameters
    ----------
    text : str
Review comment: type doesn't match.
        Input text.

    Returns
    -------
    generator
        Words contained in the processed text.
    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
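
A doctest could round this one out (illustrative sketch; the import path assumes the module stays at `gensim.summarization.textcleaner`):

>>> from gensim.summarization.textcleaner import tokenize_by_word
>>> list(tokenize_by_word("Mrs. Robinson is waiting."))
['mrs', 'robinson', 'is', 'waiting']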
Review comment: Need to add examples/highlights/motivation here (after you finish with docstrings in this file).