fix: Use Guru cards from sets A and B

navapbc · Jun 7, 2024 · 1b3fc83 · 1b3fc83
1 parent e7e8556
commit 1b3fc83
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 9 deletions.
diff --git a/05-assistive-chatbot/chatbot/__init__.py b/05-assistive-chatbot/chatbot/__init__.py
@@ -82,6 +82,7 @@ def validate_settings(settings):
             return f"Unknown {setting_name}: '{model_name}'"
 
         if chat_engine.startswith("Summaries") and "instruct" not in model_name:
+            # TODO: also surface this warning to the user, not only the log
             logger.warning("For the %s chat engine, an `*instruct` model is recommended", chat_engine)
 
     # PLACEHOLDER: Validate other settings

diff --git a/05-assistive-chatbot/chatbot/engines/v2_household_engine.py b/05-assistive-chatbot/chatbot/engines/v2_household_engine.py
@@ -185,6 +185,10 @@ def populate_summaries(gen_results, guru_card_texts, summarizer):
         # Limit summarizing of Guru cards based on score and card count
         if i > 2 and card_entry.score_sum < 0.3:
             continue
+        if card_entry.card_title not in guru_card_texts:
+            # TODO: notify admin via Literal?
+            logger.warning("Guru card not found: %s", card_entry.card_title)
+            continue
         card_text = guru_card_texts[card_entry.card_title]
         card_entry.entire_text = "\n".join([card_entry.card_title, card_text])
         # Summarize based on derived question and original question

diff --git a/05-assistive-chatbot/chatbot/guru_cards.py b/05-assistive-chatbot/chatbot/guru_cards.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import json
 import os
+from functools import cached_property
 
 from bs4 import BeautifulSoup
 
@@ -17,17 +18,17 @@ def __init__(
         self.content_key = content_key
 
     def extract_qa_text_from_guru(self):
-        json_data = self.cards_as_json()
-        question_answers = self._extract_question_answers(json_data)
+        question_answers = self._extract_question_answers()
         return question_answers
 
+    @cached_property
     def cards_as_json(self):
         with open(self.file_path, encoding="utf-8") as data_file:
             return json.load(data_file)
 
-    def _extract_question_answers(self, json_data):
+    def _extract_question_answers(self):
         question_answers = {}
-        for content in json_data:
+        for content in self.cards_as_json:
             if not content[self.question_key].strip().endswith("?"):
                 continue
             soup = BeautifulSoup(content[self.content_key], "html.parser")
@@ -38,11 +39,10 @@ def _extract_question_answers(self, json_data):
 
 def save_simplified_json(gc_processor):
     "Saves a simplified version of the Guru cards JSON file for easier review"
-    json_data = gc_processor.cards_as_json()
     name, ext = os.path.splitext(gc_processor.file_path)
     with open(f"{name}_simplified{ext}", "w", encoding="utf-8") as f:
         simplified_json = []
-        for card in json_data:
+        for card in gc_processor.cards_as_json:
             tags = [tagsItem.get("value") for tagsItem in card.get("tags", [])]
             boards = [boardsItem.get("title") for boardsItem in card.get("boards", [])]
             soup = BeautifulSoup(card[gc_processor.content_key], "html.parser")
@@ -58,6 +58,48 @@ def save_simplified_json(gc_processor):
         json.dump(simplified_json, f, indent=4)
 
 
+def diff_guru_cards(gc_processor1, gc_processor2):
+    """Return the question/answer differences between two Guru cards JSON files.
+
+    Returns a dict mapping each differing question to `(answer1, answer2)`, where
+    `None` marks a file that lacks the question. Prints summary counts and writes
+    the differing questions found in file 1 to `diff-in1.txt`.
+    """
+    qa1 = gc_processor1.extract_qa_text_from_guru()
+    print(f"Number of questions in file 1: {len(qa1)}")
+    qa2 = gc_processor2.extract_qa_text_from_guru()
+    print(f"Number of questions in file 2: {len(qa2)}")
+
+    q_matches = set(qa1.keys()) & set(qa2.keys())
+    print(f"Number of questions in both files: {len(q_matches)}")
+
+    a_matches = set(qa1.values()) & set(qa2.values())
+    print(f"Number of answers in both files: {len(a_matches)}")
+
+    # Stage 1: questions in file 1 that file 2 lacks or answers differently
+    diff = {
+        question: (answer, qa2.get(question))
+        for question, answer in qa1.items()
+        if question not in qa2 or qa2[question] != answer
+    }
+    print(f"Stage 1: number of differences: {len(diff)}")
+    print("  Count of answers not in file 2", len([a2 for a1, a2 in diff.values() if a2 is None]))
+    print(set([a2 for a1, a2 in diff.values()]))
+    # Explicit encoding: question text may be non-ASCII and the platform default varies
+    with open("diff-in1.txt", "w", encoding="utf-8") as f:
+        f.write("\n".join(diff.keys()))
+
+    # Stage 2: questions that only file 2 has (qa2 itself is left unmodified)
+    diff2 = {question: (None, answer) for question, answer in qa2.items() if question not in qa1}
+
+    print(f"Stage 2: Number of differences: {len(diff2)}")
+    print("  Count of answers not in file 1", len([a1 for a1, a2 in diff2.values() if a1 is None]))
+
+    diff |= diff2
+    print(f"Final: Number of differences: {len(diff)}")
+    return diff
+
+
 if __name__ == "__main__":
     import sys
 
@@ -66,4 +108,31 @@ def save_simplified_json(gc_processor):
     else:
         _gc_processor = GuruCardsProcessor()
 
-    save_simplified_json(_gc_processor)
+    if len(args) <= 1:
+        save_simplified_json(_gc_processor)
+    elif len(args) == 2:
+        _gc_processor2 = GuruCardsProcessor(file_path=args[1])
+        diffs = diff_guru_cards(_gc_processor, _gc_processor2).keys()
+        # print("\n".join(diffs))
+    elif len(args) == 3:
+        if args[1] == "subset":
+            with open(args[2], "r", encoding="UTF-8") as subset_file:
+                questions_subset = [line.strip() for line in subset_file]
+
+            json_data = _gc_processor.cards_as_json
+            # qa = _gc_processor.extract_qa_text_from_guru()
+
+            data_subset = []
+            for json_entry in json_data:
+                question = json_entry[_gc_processor.question_key].strip()
+                if question in questions_subset:
+                    questions_subset.remove(question)
+                    data_subset.append(json_entry)
+
+            with open("qa_subset.json", "w", encoding="UTF-8") as output_file:
+                output_file.write(json.dumps(data_subset, indent=2))
+
+            print("Remaining questions not found:")
+            print("\n".join(questions_subset))
+    else:
+        print("Usage: guru_cards.py [file_path] [file_path2]")
diff --git a/05-assistive-chatbot/get_input_files.sh b/05-assistive-chatbot/get_input_files.sh
@@ -14,4 +14,4 @@ echo "Downloading from ${GURU_CARDS_URL}"
 curl -f -L "${GURU_CARDS_URL}" -o /tmp/download.zip
 unzip -o /tmp/download.zip
 rm -v /tmp/download.zip
-mv -v guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json
+mv -v guru_cards_setB_plus_setA.json guru_cards_for_nava.json
diff --git a/05-assistive-chatbot/guru_api_access.md b/05-assistive-chatbot/guru_api_access.md
@@ -57,7 +57,7 @@ done
 jq -n '[inputs] | add' guru_cards_for_nava_?.json guru_cards_for_nava_??.json > guru_cards_for_nava.json
 
 # Create simplified JSON for readability
-python ingest.py guru_cards_for_nava.json
+python chatbot/guru_cards.py guru_cards_for_nava.json
 
 # Count cards
 jq length guru_cards_for_nava.json