Skip to content

Commit

Permalink
fix: Use Guru cards from sets A and B
Browse files Browse the repository at this point in the history
  • Loading branch information
yoomlam committed Jun 7, 2024
1 parent e7e8556 commit 1b3fc83
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 9 deletions.
1 change: 1 addition & 0 deletions 05-assistive-chatbot/chatbot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def validate_settings(settings):
return f"Unknown {setting_name}: '{model_name}'"

if chat_engine.startswith("Summaries") and "instruct" not in model_name:
# TODO: also send to user
logger.warning("For the %s chat engine, an `*instruct` model is recommended", chat_engine)

# PLACEHOLDER: Validate other settings
Expand Down
4 changes: 4 additions & 0 deletions 05-assistive-chatbot/chatbot/engines/v2_household_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ def populate_summaries(gen_results, guru_card_texts, summarizer):
# Limit summarizing of Guru cards based on score and card count
if i > 2 and card_entry.score_sum < 0.3:
continue
if card_entry.card_title not in guru_card_texts:
# TODO: notify admin via Literal?
logger.warning("Guru card not found: %s", card_entry.card_title)
continue
card_text = guru_card_texts[card_entry.card_title]
card_entry.entire_text = "\n".join([card_entry.card_title, card_text])
# Summarize based on derived question and original question
Expand Down
83 changes: 76 additions & 7 deletions 05-assistive-chatbot/chatbot/guru_cards.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
import json
import os
from functools import cached_property

from bs4 import BeautifulSoup

Expand All @@ -17,17 +18,17 @@ def __init__(
self.content_key = content_key

def extract_qa_text_from_guru(self):
json_data = self.cards_as_json()
question_answers = self._extract_question_answers(json_data)
question_answers = self._extract_question_answers()
return question_answers

@cached_property
def cards_as_json(self):
with open(self.file_path, encoding="utf-8") as data_file:
return json.load(data_file)

def _extract_question_answers(self, json_data):
def _extract_question_answers(self):
question_answers = {}
for content in json_data:
for content in self.cards_as_json:
if not content[self.question_key].strip().endswith("?"):
continue
soup = BeautifulSoup(content[self.content_key], "html.parser")
Expand All @@ -38,11 +39,10 @@ def _extract_question_answers(self, json_data):

def save_simplified_json(gc_processor):
"Saves a simplified version of the Guru cards JSON file for easier review"
json_data = gc_processor.cards_as_json()
name, ext = os.path.splitext(gc_processor.file_path)
with open(f"{name}_simplified{ext}", "w", encoding="utf-8") as f:
simplified_json = []
for card in json_data:
for card in gc_processor.cards_as_json:
tags = [tagsItem.get("value") for tagsItem in card.get("tags", [])]
boards = [boardsItem.get("title") for boardsItem in card.get("boards", [])]
soup = BeautifulSoup(card[gc_processor.content_key], "html.parser")
Expand All @@ -58,6 +58,48 @@ def save_simplified_json(gc_processor):
json.dump(simplified_json, f, indent=4)


def diff_guru_cards(gc_processor1, gc_processor2):
    """Return the differences between two Guru cards JSON files.

    Both arguments must provide ``extract_qa_text_from_guru()``, whose result
    is used here as a mapping of question text to answer text.

    Returns:
        A dict keyed by question whose values are ``(answer1, answer2)``
        tuples. ``answer2`` is None when the question only exists in file 1,
        ``answer1`` is None when it only exists in file 2, and both are set
        when the question exists in both files with different answers.

    Side effects:
        Prints summary statistics to stdout and writes the questions found in
        stage 1 (one per line) to ``diff-in1.txt`` in the current directory.
    """
    qa1 = gc_processor1.extract_qa_text_from_guru()
    print(f"Number of questions in file 1: {len(qa1)}")
    qa2 = gc_processor2.extract_qa_text_from_guru()
    print(f"Number of questions in file 2: {len(qa2)}")

    q_matches = qa1.keys() & qa2.keys()
    print(f"Number of questions in both files: {len(q_matches)}")

    a_matches = set(qa1.values()) & set(qa2.values())
    print(f"Number of answers in both files: {len(a_matches)}")

    # Stage 1: questions from file 1 that are missing from, or answered
    # differently in, file 2. The mappings are only read, never mutated, so
    # the caller's data stays intact.
    diff = {}
    for question, answer in qa1.items():
        if question not in qa2:
            diff[question] = (answer, None)
        elif qa2[question] != answer:
            diff[question] = (answer, qa2[question])
    print(f"Stage 1: number of differences: {len(diff)}")
    print(" Count of answers not in file 2", sum(1 for _a1, a2 in diff.values() if a2 is None))
    print({a2 for _a1, a2 in diff.values()})
    # Explicit encoding keeps the output independent of the platform locale,
    # matching how the rest of this module opens files.
    with open("diff-in1.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(diff))

    # Stage 2: questions that only exist in file 2.
    diff2 = {question: (None, answer) for question, answer in qa2.items() if question not in qa1}
    print(f"Stage 2: Number of differences: {len(diff2)}")
    print(" Count of answers not in file 1", sum(1 for a1, _a2 in diff2.values() if a1 is None))

    # The key sets of the two stages are disjoint, so nothing is overwritten.
    diff.update(diff2)
    print(f"Final: Number of differences: {len(diff)}")
    return diff


if __name__ == "__main__":
import sys

Expand All @@ -66,4 +108,31 @@ def save_simplified_json(gc_processor):
else:
_gc_processor = GuruCardsProcessor()

save_simplified_json(_gc_processor)
if len(args) <= 1:
save_simplified_json(_gc_processor)
elif len(args) == 2:
_gc_processor2 = GuruCardsProcessor(file_path=args[1])
diffs = diff_guru_cards(_gc_processor, _gc_processor2).keys()
# print("\n".join(diffs))
elif len(args) == 3:
if args[1] == "subset":
with open(args[2], "r", encoding="UTF-8") as subset_file:
questions_subset = [line.strip() for line in subset_file]

json_data = _gc_processor.cards_as_json
# qa = _gc_processor.extract_qa_text_from_guru()

data_subset = []
for json_entry in json_data:
question = json_entry[_gc_processor.question_key].strip()
if question in questions_subset:
questions_subset.remove(question)
data_subset.append(json_entry)

with open("qa_subset.json", "w", encoding="UTF-8") as output_file:
output_file.write(json.dumps(data_subset, indent=2))

print("Remaining questions not found:")
print("\n".join(questions_subset))
else:
print("Usage: guru_cards.py [file_path] [file_path2]")
2 changes: 1 addition & 1 deletion 05-assistive-chatbot/get_input_files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ echo "Downloading from ${GURU_CARDS_URL}"
curl -f -L "${GURU_CARDS_URL}" -o /tmp/download.zip
unzip -o /tmp/download.zip
rm -v /tmp/download.zip
mv -v guru_cards_for_nava--Multi-benefit.json guru_cards_for_nava.json
mv -v guru_cards_setB_plus_setA.json guru_cards_for_nava.json
2 changes: 1 addition & 1 deletion 05-assistive-chatbot/guru_api_access.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ done
jq -n '[inputs] | add' guru_cards_for_nava_?.json guru_cards_for_nava_??.json > guru_cards_for_nava.json
# Create simplified JSON for readability
python ingest.py guru_cards_for_nava.json
python chatbot/guru_cards.py guru_cards_for_nava.json
# Count cards
jq length guru_cards_for_nava.json
Expand Down

0 comments on commit 1b3fc83

Please sign in to comment.