diff --git a/sr-data/pyproject.toml b/sr-data/pyproject.toml
index 8389a773..9c5f6ddc 100644
--- a/sr-data/pyproject.toml
+++ b/sr-data/pyproject.toml
@@ -40,6 +40,7 @@ scikit-learn = "^1.5.1"
 cohere = "5.9.1"
 loguru = "^0.7.2"
 scikit-fuzzy = "^0.5.0"
+markdown-it-py = "^3.0.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.2.2"
diff --git a/sr-data/src/sr_data/steps/extract.py b/sr-data/src/sr_data/steps/extract.py
index 9dddea32..b549a26f 100644
--- a/sr-data/src/sr_data/steps/extract.py
+++ b/sr-data/src/sr_data/steps/extract.py
@@ -45,8 +45,8 @@ def main(repos, out):
     frame["headings"] = frame["readme"].apply(headings)
     before = len(frame)
     frame = frame.dropna(subset=["headings"])
-    stops = len(frame)
-    logger.info(f"Removed {before - stops} repositories that don't have at least one heading (#)")
+    headingless = len(frame)
+    logger.info(f"Removed {before - headingless} repositories that don't have at least one heading (#)")
     frame["headings"] = frame["headings"].apply(
         lambda readme: remove_stop_words(readme, stopwords.words("english"))
     )
@@ -59,8 +59,8 @@ def main(repos, out):
     )
     frame = frame[frame["headings"].apply(bool)]
     logger.info(
-        f"Removed {stops - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
+        f"Removed {headingless - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
     )
     frame["top"] = frame["headings"].apply(
         lambda headings: top_words(headings, 5)
     )
diff --git a/sr-data/src/sr_data/steps/mcw.py b/sr-data/src/sr_data/steps/mcw.py
index 5aa3e68a..8b5d139f 100644
--- a/sr-data/src/sr_data/steps/mcw.py
+++ b/sr-data/src/sr_data/steps/mcw.py
@@ -29,10 +29,18 @@ from markdown_it import MarkdownIt
 from nltk import word_tokenize, WordNetLemmatizer
 from nltk.corpus import stopwords
 
-from sr_data.steps.extract import wordnet_pos
+from sr_data.steps.extract import wordnet_pos, filter
 
 
-# remove not words ['|', '--', ':', '-', ',']
+# @todo #137:35min Resolve code duplication in preprocessing methods.
+# Ideally, we should reuse the `remove_stop_words` and `lemmatize` methods
+# from the extract.py step. Right now we duplicate their logic with slight
+# changes to fit the input; reusing the existing methods in extract.py would
+# be more traceable.
+# @todo #137:45min Stop-words filtering is weak.
+# The `remove_stop_words` method does not remove words such as ['the', 'to',
+# 'and', 'you', 'a']. We should remove those words too. Don't forget to
+# create unit tests.
 def main(repos, out):
     logger.info("Collecting most common words...")
     frame = pd.read_csv(repos)
@@ -43,10 +51,16 @@ def main(repos, out):
         lambda words: remove_stop_words(words, stopwords.words("english"))
     )
     frame["words"] = frame["words"].apply(lemmatize)
-    # frame["rtext"] = frame["rtext"].apply(
-    #     lambda text: filter(text, r"^[a-zA-Z]+$")
-    # )
-    # frame["mcw"] = frame["rtext"].apply(most_common)
+    rword = r"^[a-zA-Z]+$"
+    frame["words"] = frame["words"].apply(
+        lambda words: filter(words, rword)
+    )
+    before = len(frame)
+    frame = frame[frame["words"].apply(bool)]
+    logger.info(
+        f"Removed {before - len(frame)} repositories that have 0 words after regex filtering ('{rword}')"
+    )
+    frame["mcw"] = frame["words"].apply(most_common)
     frame.to_csv(out, index=False)
     logger.info(f"Saved output to {out}")
 
@@ -86,5 +100,5 @@ def lemmatize(words):
 
 
 def most_common(text):
-    words = word_tokenize(text)
+    words = word_tokenize(" ".join(text))
     return [word for word, _ in Counter(words).most_common(5)]
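
Note: the sketch below is a minimal, standalone illustration of the word pipeline that mcw.py now applies to each README (tokenize, drop NLTK stop words, lemmatize, keep only alphabetic tokens via the same rword regex, then take the five most common words). It re-implements the logic inline for clarity; the actual step reuses remove_stop_words, lemmatize, filter and most_common from the patched modules, and its lemmatization goes through wordnet_pos, which is simplified to the default noun POS here. The function name most_common_words and the sample README text are made up for this example.

import re
from collections import Counter

import nltk
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources the pipeline depends on (no-op if already present).
for pkg in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(pkg, quiet=True)


def most_common_words(readme, top=5):
    # Tokenize and lowercase the raw README text.
    words = [w.lower() for w in word_tokenize(readme)]
    # Drop English stop words.
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops]
    # Lemmatize; the real step picks the POS via wordnet_pos, default noun here.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    # Keep only purely alphabetic tokens, mirroring rword = r"^[a-zA-Z]+$".
    words = [w for w in words if re.match(r"^[a-zA-Z]+$", w)]
    # Five most common remaining words, as in most_common().
    return [word for word, _ in Counter(words).most_common(top)]


print(most_common_words("# Java linter\nLints Java code and ships Java tooling."))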