feat: view test results in Streamlit
StijnGoossens committed Oct 10, 2023
1 parent a75e167 commit 73792c6
Showing 3 changed files with 97 additions and 55 deletions.
19 changes: 8 additions & 11 deletions README.md
@@ -5,20 +5,17 @@
Evaluates LLM-based applications.

## To-do's
- [x] Convert my EHBO notes into question-answer pairs, using OpenAI Function Calling.
- [/] Turn the question-answer pairs into a test set.
- [x] Build Streamlit app for testing myself.
- [] Bug: when I click on the 'Evaluate' button, the app goes to the next question.
- [x] Convert EHBO notes into question-answer pairs, using OpenAI Function Calling.
- [x] Turn the question-answer pairs into a test set.
- [x] Build LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [] Streamlit page to view the evaluation results.
- [] Add the question-answer pairs as the knowledge base for that app.
- [] Evaluate the LLM app with the LLM evaluator.
- [] Compare the results.
- [] Streamlit page to view, edit and add test cases.
- [] Cache the OpenAI function calls.

- [x] Streamlit page to view the evaluation results.
- [ ] Combine the evaluation results into a single metric (one possible aggregation is sketched after this list).
- [ ] Evaluate and compare different LLM apps (GPT-3.5, GPT-4, with RAG)
- [ ] Streamlit page to visualize the comparison.
- [ ] Streamlit page to view, edit and add test cases.
- [ ] Integrate with MLflow for experiment tracking (?)
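One possible way to address the single-metric item: the new Streamlit page (see `app.py` in this commit) already builds a 0/1 pass/fail matrix per evaluation run, so a first aggregation could simply average it. A minimal sketch, assuming that `pandas` DataFrame as input; the function names are illustrative and not part of this commit:

```python
import pandas as pd


def overall_pass_rate(eval_matrix: pd.DataFrame) -> float:
    """Fraction of (test case, property) checks that passed, as one score per run."""
    return float(eval_matrix.to_numpy().mean())


def per_property_pass_rate(eval_matrix: pd.DataFrame) -> pd.Series:
    """Pass rate per evaluation property; handy when comparing runs side by side."""
    return eval_matrix.mean(axis=0)
```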


## Using
123 changes: 84 additions & 39 deletions src/llm_app_eval/app.py
@@ -1,49 +1,94 @@
"""Streamlit app."""

import json
import os
from importlib.metadata import version

import numpy as np
import pandas as pd
import streamlit as st
from evaluator import Evaluator
from qa_extraction import load_qa_pairs
from eval_properties import properties

st.title(f"llm-app-eval v{version('llm-app-eval')}") # type: ignore[no-untyped-call]


qa_pairs = load_qa_pairs("src/llm_app_eval/data/question_answer_pairs.csv")
evaluator = Evaluator(llm="gpt-4")

# Shuffle the question and answer pairs
np.random.seed(42)
np.random.shuffle(qa_pairs)
# Display a question and answer pair
if "idx" in st.session_state:
idx = st.session_state.idx
else:
idx = 0
st.session_state.idx = idx
st.write(f"Question {idx + 1} of {len(qa_pairs)}")
qa = qa_pairs[idx]
st.header("Question")
st.write(qa.question)
st.header("Answer")
answer = st.text_input("Answer")
st.header("Reference Answer")
st.write(qa.answer)


eval_button = st.button("Evaluate")
if eval_button:
    result = evaluator.evaluate(qa.question, answer, qa.answer)
    st.write("✅" if result.pass_fail else "❌")
    st.write(result.feedback)
    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
else:
    # Display previous and next buttons
    col1, col2, col3 = st.columns(3)
    if col1.button("Previous"):
        st.session_state.idx = max(st.session_state.idx - 1, 0)
    if col2.button("Random"):
        st.session_state.idx = np.random.randint(0, len(qa_pairs))
    if col3.button("Next"):
        st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
TEST_SET_FOLDER = "src/llm_app_eval/data/test_cases"
EVAL_FOLDER = "src/llm_app_eval/data/eval_results"
EVAL_RUNS = ["20231001_175828"]

# Load all the test cases JSON files
test_cases = {} # type: ignore
for test_case in os.listdir(TEST_SET_FOLDER):
    test_case_path = os.path.join(TEST_SET_FOLDER, test_case)
    with open(test_case_path) as f:
        test_cases[test_case] = json.load(f)

# Load all the evaluation results JSON files
eval_results = {} # type: ignore
for eval_run in EVAL_RUNS:
    eval_results[eval_run] = {}
    eval_run_folder = os.path.join(EVAL_FOLDER, eval_run)
    for eval_file in os.listdir(eval_run_folder):
        eval_file_path = os.path.join(eval_run_folder, eval_file)
        with open(eval_file_path) as f:
            eval_results[eval_run][eval_file] = json.load(f)

# Build a matrix for each evaluation run
# Each row is a test case. Each column is a property.
eval_matrices = {} # type: ignore
for eval_run in EVAL_RUNS:
    eval_matrices[eval_run] = np.zeros((len(test_cases), len(properties)))
    for test_case_idx, test_case in enumerate(test_cases):
        for property_idx, prop in enumerate(properties):
            r = eval_results[eval_run][test_case]
            for property_result in r["property_results"]:
                if property_result["property_name"] == prop.property_name:
                    eval_matrices[eval_run][test_case_idx, property_idx] = property_result[
                        "pass_fail"
                    ]
                    break
    # Turn the matrix into a dataframe
    eval_matrices[eval_run] = pd.DataFrame(
        eval_matrices[eval_run],
        columns=[prop.property_name for prop in properties],
        index=list(test_cases),
    )

    st.write(eval_matrices[eval_run])

# Select a specific test case
test_case = st.selectbox("Test case", list(test_cases.keys())) # type: ignore

# Select a specific evaluation run
eval_run = st.selectbox("Evaluation run", EVAL_RUNS) # type: ignore

# Show the test case input
st.markdown("**Test case input:**")
st.write(test_cases[test_case]["test_input"]["question"])
# Show the reference_output, historical_output, and historical_feedback, if available
if test_cases[test_case]["reference_output"]:
    st.markdown("**Reference output:**")
    st.write(test_cases[test_case]["reference_output"]["answer"])
if test_cases[test_case]["historical_output"]:
    st.markdown("**Historical output:**")
    st.write(test_cases[test_case]["historical_output"]["answer"])
if test_cases[test_case]["historical_feedback"]:
    st.markdown("**Historical feedback:**")
    st.write(test_cases[test_case]["historical_feedback"])

# Show the model output
st.markdown("**Model response:**")
st.write(eval_results[eval_run][test_case]["output"]["answer"])

# Show the evaluation results
st.markdown("**Evaluation results:**")
# Loop over the properties
for prop in properties:
    # Loop over the evaluation runs
    for eval_run in EVAL_RUNS:
        # Loop over the evaluation results
        for property_result in eval_results[eval_run][test_case]["property_results"]:
            # If the property name matches the current property, show the result
            if property_result["property_name"] == prop.property_name:
                st.write(f"{prop.property_name}: {'✅' if property_result['pass_fail'] else '❌'}")
                st.write(property_result["feedback"])
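The viewer above expects one JSON file per test case under `TEST_SET_FOLDER` and, for each run in `EVAL_RUNS`, a result file with the same name under `EVAL_FOLDER/<run>/`; the lookup `eval_results[eval_run][test_case]` only resolves when the file names match. The commit itself ships no sample files, so the sketch below is purely illustrative: it shows minimal shapes that satisfy the field accesses in `app.py`, with made-up values, a made-up file name, and a made-up property name.

```python
# Illustrative only: minimal JSON contents that satisfy the lookups in app.py.
# The file name tc_001.json and the property name "CorrectAnswer" are made up.
import json
import os

example_test_case = {
    "test_input": {"question": "Wat is de eerste stap van eerste hulp?"},
    "reference_output": {"answer": "Zorg voor veiligheid."},
    "historical_output": None,
    "historical_feedback": None,
}

example_eval_result = {
    "output": {"answer": "Zorg eerst voor je eigen veiligheid en die van omstaanders."},
    "property_results": [
        # "property_name" should match a property defined in eval_properties.properties.
        {"property_name": "CorrectAnswer", "pass_fail": True, "feedback": "Consistent with the reference answer."},
    ],
}

run = "20231001_175828"
os.makedirs("src/llm_app_eval/data/test_cases", exist_ok=True)
os.makedirs(f"src/llm_app_eval/data/eval_results/{run}", exist_ok=True)

# The eval result file keeps the test case's file name so that
# eval_results[eval_run][test_case] lines up with test_cases[test_case].
with open("src/llm_app_eval/data/test_cases/tc_001.json", "w") as f:
    json.dump(example_test_case, f, ensure_ascii=False, indent=2)
with open(f"src/llm_app_eval/data/eval_results/{run}/tc_001.json", "w") as f:
    json.dump(example_eval_result, f, ensure_ascii=False, indent=2)
```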
10 changes: 5 additions & 5 deletions src/llm_app_eval/notebooks/20231001_evaluate_properties.ipynb
@@ -41,28 +41,28 @@
"source": [
"test_cases = [\n",
" TestCase(\n",
" test_id=uuid.uuid4().hex,\n",
" test_id=1,\n",
" test_input={\"question\": \"Waarom zou het slachtoffer naar de dokter moeten gaan na het Heimlich-manoeuvre?\"},\n",
" reference_output={\"answer\": \"Omdat het Heimlich-manoeuvre een interne bloeding kan hebben veroorzaakt.\"},\n",
" ),\n",
" TestCase(\n",
" test_id=uuid.uuid4().hex,\n",
" test_id=2,\n",
" test_input={\"question\": \"Wat zijn de vier stappen van eerste hulp?\"},\n",
" reference_output={\"answer\": \"1. Zorg voor veiligheid, 2. Beoordeel de toestand van het slachtoffer, 3. Hulpdiensten verwittigen indien nodig, 4. Verleen verdere eerste hulp.\"},\n",
" ),\n",
" TestCase(\n",
" test_id=uuid.uuid4().hex,\n",
" test_id=3,\n",
" test_input={\"question\": \"Wat is de eerste stap van eerste hulp?\"},\n",
" historical_output={\"answer\": \"Zorg voor de veiligheid van het slachtoffer.\"},\n",
" historical_feedback=\"Het is belangrijk om ook voor de veiligheid van jezelf en omstaanders te zorgen.\",\n",
" ),\n",
" TestCase(\n",
" test_id=uuid.uuid4().hex,\n",
" test_id=4,\n",
" test_input={\"question\": \"Wat moet je doen als het slachtoffer geen ademhaling heeft?\"},\n",
" historical_output={\"answer\": \"Bel 112\"},\n",
" ),\n",
" TestCase(\n",
" test_id=uuid.uuid4().hex,\n",
" test_id=5,\n",
" test_input={\"question\": \"Moet je eten of drinken toedienen in een noodsituatie?\"},\n",
" reference_output={\"answer\": \"Nee, behalve bij een hypo (lage bloedsuiker) of hitte- en zonneslag\"},\n",
" ),\n",
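The notebook cell above instantiates `TestCase` objects whose class definition lives elsewhere in the package and is not part of this diff. Purely as a reading aid, a hypothetical stand-in with the fields used in the cell could look like the sketch below; the real class may well differ (for instance, it may be a Pydantic model with dedicated input/output types).

```python
# Hypothetical stand-in for TestCase; field names mirror the notebook cell above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class TestCase:
    test_id: int  # was uuid.uuid4().hex before this commit, now a plain integer
    test_input: dict
    reference_output: Optional[dict] = None
    historical_output: Optional[dict] = None
    historical_feedback: Optional[str] = None
```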
