examples/tweets/tweet_metric.py

import dspy
from dspy.datasets import HotPotQA

# Load the dataset.
dataset = HotPotQA(train_seed=1, train_size=200, eval_seed=2023, dev_size=200, test_size=0, keep_details=True)

# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
trainset = [x.without('id', 'type').with_inputs('question') for x in dataset.train]
devset = [x.without('id', 'type').with_inputs('question') for x in dataset.dev]
valset, devset = devset[:50], devset[50:]


# Define the signature for automatic assessments.
class Assess(dspy.Signature):
    """Assess the quality of a tweet along the specified dimension."""

    context = dspy.InputField(desc='ignore if N/A')
    assessed_text = dspy.InputField()
    assessment_question = dspy.InputField()
    assessment_answer = dspy.OutputField(desc="Yes or No")


gpt4T = dspy.OpenAI(model='gpt-4-1106-preview', max_tokens=1000, model_type='chat')
retrieve = dspy.Retrieve(k=5)
METRIC = None

def metric(gold, pred, trace=None):
    question, answer, tweet = gold.question, gold.answer, pred.output
    context = retrieve(question).passages

    engaging = "Does the assessed text make for a self-contained, engaging tweet?"
    faithful = "Is the assessed text grounded in the context? Say no if it includes significant facts not in the context."
    correct = f"The text above is should answer `{question}`. The gold answer is `{answer}`."
    correct = f"{correct} Does the assessed text above contain the gold answer?"
    
    with dspy.context(lm=gpt4T):
        faithful = dspy.Predict(Assess)(context=context, assessed_text=tweet, assessment_question=faithful)
        correct =  dspy.Predict(Assess)(context='N/A', assessed_text=tweet, assessment_question=correct)
        engaging = dspy.Predict(Assess)(context='N/A', assessed_text=tweet, assessment_question=engaging)

    correct, engaging, faithful = (m.assessment_answer.split()[0].lower() == 'yes' for m in [correct, engaging, faithful])
    score = (correct + engaging + faithful) if correct and (len(tweet) <= 280) else 0

    if METRIC is not None:
        if METRIC == 'correct': return correct
        if METRIC == 'engaging': return engaging
        if METRIC == 'faithful': return faithful

    if trace is not None: return score >= 3
    return score / 3.0