Add unit tests

jrrobison1 · Aug 16, 2024 · f16ae86
1 parent fe1d16b
commit f16ae86
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 6 deletions.
diff --git a/pycpidr/idea_density_rater_rules.py b/pycpidr/idea_density_rater_rules.py
@@ -3,7 +3,7 @@
 
 from pycpidr.utils.constants import *
 from pycpidr.utils.word_search_utils import (
-    is_beginning_of_sentence,
+    beginning_of_sentence,
     is_repetition,
     search_backwards,
 )
@@ -203,7 +203,7 @@ def adjust_word_order(word_list: List[WordListItem], i: int, speech_mode: bool)
     # Note: In some cases this may move a word too far right,
     # but the effect on proposition counting is benign.
     if word.lowercase_token in AUXILIARY_VERBS:
-        sentence_start = is_beginning_of_sentence(word_list, i)
+        sentence_start = beginning_of_sentence(word_list, i)
         if sentence_start == i or word_list[sentence_start].tag in INTERROGATIVES:
             # find out where to move to
             target_position = i + 1
@@ -627,7 +627,7 @@ def handle_fillers(word_list: List[WordListItem], i: int, speech_mode: bool) ->
     # 610
     # A sentence consisting entirely of probable filler words is propositionless
     if speech_mode and word.tag == SENTENCE_END:
-        bos = is_beginning_of_sentence(word_list, i)
+        bos = beginning_of_sentence(word_list, i)
         k = 0
         for j in range(bos, i):
             if word_list[j].tag != "UH" and word_list[j].lowercase_token not in FILLER:

diff --git a/tests/test_idea_density_rater.py b/tests/test_idea_density_rater.py
@@ -19,6 +19,8 @@
 from pycpidr.idea_density_rater import count_words_and_propositions, rate_text
 from pycpidr.tagger import tag_text
 from pycpidr.word_item import WordListItem, WordList
+from pycpidr.utils.word_search_utils import beginning_of_sentence
+from pycpidr.utils.constants import SENTENCE_END
 
 try:
     nlp = spacy.load("en_core_web_sm")
@@ -220,11 +222,12 @@ def test_turner_1987_passage_2():
 
     word_count, proposition_count, idea_density, word_list = rate_text(text, nlp)
 
-    assert word_count == 363
+    # Note: The original CPIDR 3.2 finds 366 words
+    assert word_count == 362
 
     # Note: The original CPIDR 3.2 finds 191 propositions
-    assert proposition_count == 189
-    assert idea_density == pytest.approx(0.520, abs=1e-3)
+    assert proposition_count == 188
+    assert idea_density == pytest.approx(0.519, abs=1e-3)
 
 
 def test_turner_1987_passage_3():

diff --git a/tests/utils/test_word_search_utils.py b/tests/utils/test_word_search_utils.py
@@ -0,0 +1,62 @@
+import pytest
+from pycpidr.utils.word_search_utils import beginning_of_sentence
+from pycpidr.word_item import WordListItem
+from pycpidr.utils.constants import SENTENCE_END
+
+
+def create_word_list(tokens, tags):
+    return [WordListItem(token=t, tag=tag) for t, tag in zip(tokens, tags)]
+
+
+def test_beginning_of_sentence_middle():
+    words = create_word_list(
+        ["This", "is", "a", "sentence", ".", "Another", "one", "."],
+        [
+            "PLC_TAG",
+            "PLC_TAG",
+            "PLC_TAG",
+            "PLC_TAG",
+            SENTENCE_END,
+            "PLC_TAG",
+            "PLC_TAG",
+            SENTENCE_END,
+        ],
+    )
+    assert beginning_of_sentence(words, 6) == 5
+
+
+def test_beginning_of_sentence_start():
+    words = create_word_list(
+        ["This", "is", "a", "sentence", "."],
+        ["PLC_TAG", "PLC_TAG", "PLC_TAG", "PLC_TAG", SENTENCE_END],
+    )
+    assert beginning_of_sentence(words, 2) == 0
+
+
+def test_beginning_of_sentence_end():
+    words = create_word_list(
+        ["This", "is", "a", "sentence", "."],
+        ["PLC_TAG", "PLC_TAG", "PLC_TAG", "PLC_TAG", SENTENCE_END],
+    )
+    assert beginning_of_sentence(words, 4) == 0
+
+
+def test_beginning_of_sentence_single_word():
+    words = create_word_list(["Word"], [""])
+    assert beginning_of_sentence(words, 0) == 0
+
+
+def test_beginning_of_sentence_multiple_sentences():
+    words = create_word_list(
+        ["First", ".", "Second", ".", "Third", "sentence", "."],
+        [
+            "PLC_TAG",
+            SENTENCE_END,
+            "PLC_TAG",
+            SENTENCE_END,
+            "PLC_TAG",
+            "PLC_TAG",
+            SENTENCE_END,
+        ],
+    )
+    assert beginning_of_sentence(words, 5) == 4