Commit c249fb8

feat(#137): markdownit package, puzzles
h1alexbel committed Oct 15, 2024
1 parent ef742df commit c249fb8
Showing 3 changed files with 25 additions and 10 deletions.
1 change: 1 addition & 0 deletions sr-data/pyproject.toml
@@ -40,6 +40,7 @@ scikit-learn = "^1.5.1"
cohere = "5.9.1"
loguru = "^0.7.2"
scikit-fuzzy = "^0.5.0"
+markdown-it-py = "^3.0.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.2.2"
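Note: the new markdown-it-py dependency is imported as MarkdownIt in mcw.py below, but how it is wired into README parsing isn't visible in this diff. A minimal sketch of extracting heading text with this package, assuming that is the intent (the `headings` name mirrors the call site in extract.py and is otherwise an assumption):

from markdown_it import MarkdownIt

def headings(readme):
    # parse the README; each heading_open token is followed
    # by an inline token that carries the heading's text
    tokens = MarkdownIt().parse(readme)
    return [
        tokens[i + 1].content
        for i, token in enumerate(tokens)
        if token.type == "heading_open"
    ]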
6 changes: 3 additions & 3 deletions sr-data/src/sr_data/steps/extract.py
@@ -45,8 +45,8 @@ def main(repos, out):
frame["headings"] = frame["readme"].apply(headings)
before = len(frame)
frame = frame.dropna(subset=["headings"])
stops = len(frame)
logger.info(f"Removed {before - stops} repositories that don't have at least one heading (#)")
headingless = len(frame)
logger.info(f"Removed {before - headingless} repositories that don't have at least one heading (#)")
frame["headings"] = frame["headings"].apply(
lambda readme: remove_stop_words(readme, stopwords.words("english"))
)
@@ -59,7 +59,7 @@
    )
    frame = frame[frame["headings"].apply(bool)]
    logger.info(
-        f"Removed {stops - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
+        f"Removed {headingless - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
    )
    frame["top"] = frame["headings"].apply(
        lambda headings: top_words(headings, 5)
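The regex filtering above delegates to a `filter` helper defined in extract.py (and re-exported to mcw.py below); its body is outside this diff. Judging from the call sites, a plausible sketch, with the signature assumed from usage:

import re

def filter(words, pattern):
    # keep only tokens fully matched by the pattern, e.g.
    # r"^[a-zA-Z]+$" keeps purely alphabetic words; note that
    # this name shadows Python's built-in filter
    return [word for word in words if re.match(pattern, word)]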
28 changes: 21 additions & 7 deletions sr-data/src/sr_data/steps/mcw.py
@@ -29,10 +29,18 @@
from markdown_it import MarkdownIt
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
-from sr_data.steps.extract import wordnet_pos
+from sr_data.steps.extract import wordnet_pos, filter


-# remove not words ['|', '--', ':', '-', ',']
+# @todo #137:35min Resolve code duplication in preprocessing methods.
+# Ideally, we should reuse the `remove_stop_words` and `lemmatize` methods
+# from the extract.py step. Right now we duplicate their logic with slight
+# changes to fit the input; reusing the existing methods in extract.py would
+# be more traceable.
+# @todo #137:45min Stop words filtering is weak.
+# The remove_stop_words method doesn't remove words such as ['the', 'to',
+# 'and', 'you', 'a'], etc. We should remove these words too. Don't forget to
+# create unit tests.
def main(repos, out):
    logger.info("Collecting most common words...")
    frame = pd.read_csv(repos)
@@ -43,10 +51,16 @@ def main(repos, out):
        lambda words: remove_stop_words(words, stopwords.words("english"))
    )
    frame["words"] = frame["words"].apply(lemmatize)
-    # frame["rtext"] = frame["rtext"].apply(
-    #     lambda text: filter(text, r"^[a-zA-Z]+$")
-    # )
-    # frame["mcw"] = frame["rtext"].apply(most_common)
+    rword = r"^[a-zA-Z]+$"
+    frame["words"] = frame["words"].apply(
+        lambda words: filter(words, rword)
+    )
+    before = len(frame)
+    frame = frame[frame["words"].apply(bool)]
+    logger.info(
+        f"Removed {before - len(frame)} repositories that have 0 words after regex filtering ('{rword}')"
+    )
+    frame["mcw"] = frame["words"].apply(most_common)
    frame.to_csv(out, index=False)
    logger.info(f"Saved output to {out}")

@@ -86,5 +100,5 @@ def lemmatize(words):


def most_common(text):
-    words = word_tokenize(text)
+    words = word_tokenize(" ".join(text))
    return [word for word, _ in Counter(words).most_common(5)]
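On the second puzzle above: a common reason tokens like 'The' or 'You' survive stop-word removal is a case-sensitive membership test against NLTK's all-lowercase stop-word list. A hedged sketch of a fix, plus the unit test the puzzle asks for; this `remove_stop_words` is an illustrative stand-in, not the project's implementation, and it assumes the NLTK stopwords corpus is available:

from nltk.corpus import stopwords

def remove_stop_words(words, stops):
    # lowercase each token before the membership check, so that
    # "The" and "You" are filtered just like "the" and "you"
    stops = set(stops)
    return [word for word in words if word.lower() not in stops]

def test_removes_common_stop_words():
    # requires: nltk.download("stopwords")
    result = remove_stop_words(
        ["The", "to", "and", "you", "a", "parser"],
        stopwords.words("english"),
    )
    assert result == ["parser"]

Separately, since frame["words"] now holds token lists, `most_common` joins them back into a string only to re-tokenize it; applying Counter to the list directly would avoid that round-trip.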
