Commit c249fb8

feat(#137): markdownit package, puzzles
h1alexbel committed Oct 15, 2024
1 parent ef742df commit c249fb8
Showing 3 changed files with 25 additions and 10 deletions.
1 change: 1 addition & 0 deletions sr-data/pyproject.toml
@@ -40,6 +40,7 @@ scikit-learn = "^1.5.1"
cohere = "5.9.1"
loguru = "^0.7.2"
scikit-fuzzy = "^0.5.0"
+markdown-it-py = "^3.0.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
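Note: the new markdown-it-py dependency is imported as MarkdownIt in mcw.py below, but how it is wired into README parsing isn't visible in this diff. A minimal sketch of extracting heading text with this package, assuming that is the intent (the `headings` name mirrors the call site in extract.py and is otherwise an assumption):

from markdown_it import MarkdownIt

def headings(readme):
    # parse the README; each heading_open token is followed
    # by an inline token that carries the heading's text
    tokens = MarkdownIt().parse(readme)
    return [
        tokens[i + 1].content
        for i, token in enumerate(tokens)
        if token.type == "heading_open"
    ]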
6 changes: 3 additions & 3 deletions sr-data/src/sr_data/steps/extract.py
@@ -45,8 +45,8 @@ def main(repos, out):
frame["headings"] = frame["readme"].apply(headings)
before = len(frame)
frame = frame.dropna(subset=["headings"])
stops = len(frame)
logger.info(f"Removed {before - stops} repositories that don't have at least one heading (#)")
headingless = len(frame)
logger.info(f"Removed {before - headingless} repositories that don't have at least one heading (#)")
frame["headings"] = frame["headings"].apply(
lambda readme: remove_stop_words(readme, stopwords.words("english"))
)
@@ -59,7 +59,7 @@
    )
    frame = frame[frame["headings"].apply(bool)]
    logger.info(
-        f"Removed {stops - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
+        f"Removed {headingless - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
    )
    frame["top"] = frame["headings"].apply(
        lambda headings: top_words(headings, 5)
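The regex filtering above delegates to a `filter` helper defined in extract.py (and re-exported to mcw.py below); its body is outside this diff. Judging from the call sites, a plausible sketch, with the signature assumed from usage:

import re

def filter(words, pattern):
    # keep only tokens fully matched by the pattern, e.g.
    # r"^[a-zA-Z]+$" keeps purely alphabetic words; note that
    # this name shadows Python's built-in filter
    return [word for word in words if re.match(pattern, word)]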
28 changes: 21 additions & 7 deletions sr-data/src/sr_data/steps/mcw.py
@@ -29,10 +29,18 @@
from markdown_it import MarkdownIt
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
-from sr_data.steps.extract import wordnet_pos
+from sr_data.steps.extract import wordnet_pos, filter


-# remove not words ['|', '--', ':', '-', ',']
+# @todo #137:35min Resolve code duplication in preprocessing methods.
+# Ideally, we should reuse the `remove_stop_words` and `lemmatize` methods
+# from the extract.py step. Right now we duplicate their logic with slight
+# changes to fit the input; reusing the existing methods in extract.py would
+# be more traceable.
+# @todo #137:45min Stop words filtering is weak.
+# The remove_stop_words method doesn't remove words such as ['the', 'to',
+# 'and', 'you', 'a'], etc. We should remove these words too. Don't forget to
+# create unit tests.
def main(repos, out):
    logger.info("Collecting most common words...")
    frame = pd.read_csv(repos)
@@ -43,10 +51,16 @@ def main(repos, out):
        lambda words: remove_stop_words(words, stopwords.words("english"))
    )
    frame["words"] = frame["words"].apply(lemmatize)
-    # frame["rtext"] = frame["rtext"].apply(
-    #     lambda text: filter(text, r"^[a-zA-Z]+$")
-    # )
-    # frame["mcw"] = frame["rtext"].apply(most_common)
+    rword = r"^[a-zA-Z]+$"
+    frame["words"] = frame["words"].apply(
+        lambda words: filter(words, rword)
+    )
+    before = len(frame)
+    frame = frame[frame["words"].apply(bool)]
+    logger.info(
+        f"Removed {before - len(frame)} repositories that have 0 words after regex filtering ('{rword}')"
+    )
+    frame["mcw"] = frame["words"].apply(most_common)
    frame.to_csv(out, index=False)
    logger.info(f"Saved output to {out}")

@@ -86,5 +100,5 @@ def lemmatize(words):


def most_common(text):
-    words = word_tokenize(text)
+    words = word_tokenize(" ".join(text))
    return [word for word, _ in Counter(words).most_common(5)]
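On the second puzzle above: a common reason tokens like 'The' or 'You' survive stop-word removal is a case-sensitive membership test against NLTK's all-lowercase stop-word list. A hedged sketch of a fix, plus the unit test the puzzle asks for; this `remove_stop_words` is an illustrative stand-in, not the project's implementation, and it assumes the NLTK stopwords corpus is available:

from nltk.corpus import stopwords

def remove_stop_words(words, stops):
    # lowercase each token before the membership check, so that
    # "The" and "You" are filtered just like "the" and "you"
    stops = set(stops)
    return [word for word in words if word.lower() not in stops]

def test_removes_common_stop_words():
    # requires: nltk.download("stopwords")
    result = remove_stop_words(
        ["The", "to", "and", "you", "a", "parser"],
        stopwords.words("english"),
    )
    assert result == ["parser"]

Separately, since frame["words"] now holds token lists, `most_common` joins them back into a string only to re-tokenize it; applying Counter to the list directly would avoid that round-trip.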
