From 7faa2b30437dbee5a19f779eec97eae05d7eb8d4 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 21 Nov 2024 10:04:37 -0800 Subject: [PATCH 1/2] improve confidence score generation --- libem/core/struct/pattern.py | 4 ++-- libem/match/function.py | 26 ++++++++++++++++---------- libem/match/prompt.py | 13 ++++++++----- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/libem/core/struct/pattern.py b/libem/core/struct/pattern.py index ef62ad6..54ae2c8 100644 --- a/libem/core/struct/pattern.py +++ b/libem/core/struct/pattern.py @@ -9,6 +9,6 @@ ) Confidence = Prompt( - default="Give a confidence score from 0.0 to 1.0, with 0.0 being a guess " - "and 1.0 being confident, give the score only, do not justify.", + default="At the end, give a confidence score from 0.0 to 1.0, " + "do not justify.", ) diff --git a/libem/match/function.py b/libem/match/function.py index d561ba8..b0871b1 100644 --- a/libem/match/function.py +++ b/libem/match/function.py @@ -127,9 +127,9 @@ async def once(left: str, right: str) -> dict: prompt.role(), prompt.rules(), prompt.experiences(), - struct.CoT() if parameter.cot() else "", - struct.Confidence() if parameter.confidence() else "", + prompt.CoT() if parameter.cot() else "", prompt.output(), + prompt.Confidence() if parameter.confidence() else "", ) match_prompt = Prompt.join( @@ -289,20 +289,14 @@ def parse_output(output: str) -> dict: """Handle the model output of format: ``` (e.g., "Name Comparison: ...") - (e.g., "Confidence Score: 5" or "5") (e.g., yes) + (e.g., "Confidence Score: 0.9" or "0.9") ``` """ # remove empty lines and process lines in reverse order output = [s for s in output.splitlines() if s][::-1] - answer = output.pop(0).lower() - if prompt.output.value == Index("likelihood"): - answer = float(answer) - else: - answer = "yes" if "yes" in answer else "no" - - confidence, explanation = None, None + answer, confidence, explanation = "no", None, None if parameter.confidence(): for i, line in enumerate(output): @@ -312,6 +306,18 @@ def parse_output(output: str) -> dict: confidence = float(''.join(nums)) output = output[i + 1:] break + + for i, line in enumerate(output): + line = line.lower() + nums = re.findall(r"\d+\.\d+|\d+", line) + if prompt.output.value == Index("likelihood") and nums: + answer = float(answer) + output = output[i + 1:] + break + elif 'yes' in line or 'no' in line: + answer = "yes" if "yes" in line else "no" + output = output[i + 1:] + break if parameter.cot(): explanation = "\n".join(output[::-1]) diff --git a/libem/match/prompt.py b/libem/match/prompt.py index 517d19c..19e8874 100644 --- a/libem/match/prompt.py +++ b/libem/match/prompt.py @@ -2,6 +2,9 @@ from libem.core.struct.prompt import ( Shot, Rules, Experiences ) +from libem.core.struct.pattern import ( + CoT, Confidence +) from libem.match.parameter import model """System prompts""" @@ -25,17 +28,17 @@ default=Index( lambda: "strict" if model() in { - "llama3", "llama3.1", "llama3.2-3b", "llama3.2-1b", - "gpt-4o-2024-08-06", "claude-3-5-sonnet-20240620", + "llama3", "llama3.1", "llama3.2-3b", "llama3.2-1b", + "claude-3-5-sonnet-20240620", } else "standard" ), options={ - "standard": "At the end, give your answer in the form of a " + "standard": "Give your answer in the form of a " "single 'yes' or 'no'.", - "strict": "At the end, give your answer in the form of a " + "strict": "Give your answer in the form of a " "single 'yes' or 'no'. 
Nothing else.", - "likelihood": "At the end, give your answer strictly in the " + "likelihood": "Give your answer strictly in the " "format of a single number between 0.0 and 1.0, " "estimating the likelihood that the two entities " "are the same.", From cbe4aec3853a2918486f49be614cc52edee14df8 Mon Sep 17 00:00:00 2001 From: Char15Xu Date: Sun, 1 Dec 2024 23:49:51 -0500 Subject: [PATCH 2/2] Initial commit in cascade-feature --- .gitignore | 10 + examples/cascade/bench.py | 28 ++ examples/cascade/online.py | 11 + examples/cascade/util.py | 435 ++++++++++++++++++++++++++++ libem/cascade/__init__.py | 0 libem/cascade/function.py | 58 ++++ libem/cascade/match/function.py | 24 ++ libem/cascade/prematch/function.py | 23 ++ libem/cascade/util.py | 272 +++++++++++++++++ libem/cascade/vectorize/function.py | 25 ++ libem/optimize/cost/openai.py | 1 - libem/optimize/function.py | 5 +- 12 files changed, 890 insertions(+), 2 deletions(-) create mode 100644 examples/cascade/bench.py create mode 100644 examples/cascade/online.py create mode 100644 examples/cascade/util.py create mode 100644 libem/cascade/__init__.py create mode 100644 libem/cascade/function.py create mode 100644 libem/cascade/match/function.py create mode 100644 libem/cascade/prematch/function.py create mode 100644 libem/cascade/util.py create mode 100644 libem/cascade/vectorize/function.py diff --git a/.gitignore b/.gitignore index edff4e5..564ddf0 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,13 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ +# Ignore macOS .DS_Store files +.DS_Store +examples/.DS_Store +examples/cascade/.DS_Store + +# Ignore specific output folder +examples/cascade/output/ + +# Ignore libem-specific .DS_Store file +libem/.DS_Store \ No newline at end of file diff --git a/examples/cascade/bench.py b/examples/cascade/bench.py new file mode 100644 index 0000000..b3636ab --- /dev/null +++ b/examples/cascade/bench.py @@ -0,0 +1,28 @@ +import numpy as np +from benchmark.run import args +import json +from libem.prepare.datasets import abt_buy, beer, itunes_amazon +from libem.cascade.function import online +from util import sensitivity_analysis, generate_stats, confidence_cost_plot, confidence_f1_plot, plot_result, save_results, compare_results + +def bench(): + results_data = online(args(), dataset=abt_buy, num_pairs=None, threshold=0.9) + compare_results(results_data) + cascade_stats, prematch_single, match_single = generate_stats(results_data) + save_results(cascade_stats, prematch_single, match_single) + plot_result(cascade_stats, prematch_single, match_single) + + +def sensitivity_analysis(): + # Perform sensitivity analysis + thresholds = np.arange(0.1, 1.0, 0.1) + f1_scores, costs = sensitivity_analysis(args(), abt_buy, thresholds, num_pairs=100) + F1_SCORE = f1_scores + COSTS = costs + confidence_cost_plot(thresholds, COSTS) + confidence_f1_plot(thresholds, F1_SCORE) + + + +if __name__ == '__main__': + bench() \ No newline at end of file diff --git a/examples/cascade/online.py b/examples/cascade/online.py new file mode 100644 index 0000000..e0e44f8 --- /dev/null +++ b/examples/cascade/online.py @@ -0,0 +1,11 @@ +from libem.prepare.datasets import abt_buy +from benchmark.run import args +import numpy as np +from libem.cascade.function import online + +def main(): + online(args(), abt_buy, num_pairs=100, threshold=0.9) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/cascade/util.py 
b/examples/cascade/util.py new file mode 100644 index 0000000..bae241d --- /dev/null +++ b/examples/cascade/util.py @@ -0,0 +1,435 @@ +import os +import json +import libem +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +from collections import defaultdict +from libem.core.eval import accuracy +from libem.cascade.util import profile +from libem.cascade.function import online +from libem.cascade.match.function import run as match + + +def benchmark(result): + cascade_stats = generate_stats(result) + + save_results(result, cascade_stats) + + plot_result(result, cascade_stats) + + return cascade_stats, result["cascade_result"] + + +def generate_stats(result): + + args = result["args"] + train_set = result["train_set"] + test_set = result["test_set"] + prematch_model = result["prematch_model"] + match_model = result["match_model"] + num_pairs = result["num_pairs"] + prematch_stats = result["prematch_stats"] + prematch_results = result["prematch_results"] + match_stats = result["match_stats"] + cascade_stats = result["cascade_stats"] + cascade_result = result["cascade_result"] + start_time = result["start_time"] + end_time = result["end_time"] + + print("Prematch Stats: ", prematch_stats) + print("Match Stats: ", match_stats) + + overall_stat = profile(args, cascade_result, num_pairs) + overall_stat['throughput'] = libem.round(len(test_set) / (end_time - start_time), 2) + overall_stat['tokens'] = {} + + prematch_cost = prematch_stats.get('tokens', {}).get('cost', 0) + match_cost = match_stats.get('tokens', {}).get('cost', 0) + print(prematch_cost, match_cost) + + overall_stat['tokens']['cost'] = prematch_cost + match_cost + cascade_stats = { + "dataset": args.name, + "stats": overall_stat, + "stats by stages": { + f"Prematch {prematch_model}": prematch_stats, + f"Match {match_model}": match_stats, + }, + "results": cascade_result + } + + result["cascade_stats"] = cascade_stats + + # Result when only Prematch model is used + prematch_single_stats, prematch_single_results = prematch_stats, prematch_results + + prematch_single = { + "dataset": args.name, + "model": prematch_model, + "stats": prematch_single_stats, + "results": prematch_single_results + } + + # Result when only Match model is used + match_single_state, match_single_result = match(train_set, test_set, args, model_choice=match_model) + + match_single = { + "dataset": args.name, + "model": match_model, + "stats": match_single_state, + "results": match_single_result + } + + return cascade_stats, prematch_single, match_single + + +def save_results(cascade, prematch_single, match_single): + # Save the output + dataset = cascade["dataset"] + output_path = os.path.join("examples", "cascade", "output", dataset) + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + + # Output of the Cascade + dataset = cascade["dataset"] + cascade_file_name = f"cascade-{dataset}-{timestamp}.json" + cascade_file_path = os.path.join(output_path, cascade_file_name) + os.makedirs(os.path.dirname(cascade_file_path), exist_ok=True) + with open(cascade_file_path, "w") as json_file: + json.dump(cascade, json_file, indent=4) + + # Output of the Prematch + dataset = prematch_single["dataset"] + prematch_model = prematch_single["model"] + prematch_file_name = f"{prematch_model}-{dataset}-{timestamp}.json" + prematch_file_path = os.path.join(output_path, prematch_file_name) + os.makedirs(os.path.dirname(prematch_file_path), exist_ok=True) + with open(prematch_file_path, "w") as json_file: + json.dump(prematch_single, json_file, 
indent=4) + + # Output of the Match + dataset = match_single["dataset"] + match_model = match_single["model"] + match_file_name = f"{match_model}-{dataset}-{timestamp}.json" + match_file_path = os.path.join(output_path, match_file_name) + os.makedirs(os.path.dirname(match_file_path), exist_ok=True) + with open(match_file_path, "w") as json_file: + json.dump(match_single, json_file, indent=4) + + +def plot_result(cascade, prematch_single, match_single): + dataset = cascade["dataset"] + output_path = os.path.join("examples", "cascade", "output", dataset) + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + + prematch_model = prematch_single["model"] + match_model = match_single["model"] + result_prematch = prematch_single["results"] + result_match = match_single["results"] + approaches = [prematch_model, match_model, "Cascade"] + + # Plot for F1 Graph + f1 = [data['stats']['f1'] for data in [prematch_single, match_single, cascade]] + min_f1, max_f1 = min(f1), max(f1) + y_margin = 2 + plt.bar(approaches, f1, color='skyblue', label='F1 Score') + for i, value in enumerate(f1): + plt.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom') + plt.ylim(min_f1 - y_margin, max_f1 + y_margin) + plt.ylabel('F1 Score') + plt.xlabel('Models') + plt.title('F1 Score', pad=20) + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.legend() + plt.tight_layout() + file_name = f"f1-comparison-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + # Plot for Cost + cost = [data['stats']['tokens']['cost'] for data in [prematch_single, match_single, cascade]] + plt.bar(approaches, cost, color='green', label='Cost') + for i, value in enumerate(cost): + plt.text(i, value + 0.0001, f'{value:.5f}', ha='center', va='bottom') + plt.ylabel('Cost') + plt.xlabel('Models') + plt.title('Cost') + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.legend() + plt.tight_layout() + file_name = f"cost-comparison-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + # Confidence Calibration + prematch_acc = accuracy_groupby_confidence(result_prematch) + match_acc = accuracy_groupby_confidence(result_match) + output_path = os.path.join("examples", "cascade", "output", dataset) + confidence_calibration_graph(prematch_acc, match_acc, dataset, output_path) + + +def accuracy_groupby_confidence(data): + confidence_dict = defaultdict(list) + for item in data: + confidence = item['confidence'] + confidence_dict[confidence].append(item) + + accuracy_dict = {} + + for confidence, items in confidence_dict.items(): + truth = [item['label'] == 1 for item in items] + predictions = [item['pred'] == "yes" for item in items] + acc = accuracy(truth, predictions) + if len(items) > 5: + accuracy_dict[confidence] = acc + + keys = accuracy_dict.keys() + keys = [key for key in keys if key is not None] + keys_sorted = sorted(keys) + sorted_dict = {key: accuracy_dict[key] for key in keys_sorted} + + return sorted_dict + + +def confidence_calibration_graph(prematch_acc, match_acc, dataset, output_path): + x_gpt4o_mini = list(prematch_acc.keys()) + y_gpt4o_mini = list(prematch_acc.values()) + + x_gpt4o = list(match_acc.keys()) + y_gpt4o = list(match_acc.values()) + + perfect_calibration = np.linspace(0, 1, 100) + + # Plot + plt.figure(figsize=(8, 6)) + plt.plot(x_gpt4o_mini, y_gpt4o_mini, '-o', label="GPT-4o-mini", color='darkblue', linewidth=1.5, alpha=0.8) + 
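# ---- Editor's illustrative sketch (not part of the patch) ----
# accuracy_groupby_confidence above buckets results by the exact confidence
# value the model emitted and drops buckets with five or fewer pairs. A common
# alternative (an assumption here, not what the patch does) is to bin
# confidences into fixed-width intervals, which smooths the calibration curve
# when few pairs share the same score:
import numpy as np

def binned_calibration(confidences, correct, n_bins=10):
    """Return (bin_centers, per_bin_accuracy) for a reliability plot."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=bool)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # fold a confidence of exactly 1.0 into the last bin so it is not dropped
    bins = np.minimum(np.digitize(confidences, edges) - 1, n_bins - 1)
    centers, accuracies = [], []
    for b in range(n_bins):
        in_bin = bins == b
        if in_bin.any():
            centers.append((edges[b] + edges[b + 1]) / 2)
            accuracies.append(correct[in_bin].mean())
    return centers, accuracies
# ---- End of sketch ----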
plt.plot(x_gpt4o, y_gpt4o, '-o', label="GPT-4o", color='red', linewidth=1.5, alpha=0.8) + plt.plot(perfect_calibration, perfect_calibration, '--', label="Perfect Calibration", color='black', alpha=0.5) + + plt.xlabel("Confidence", fontsize=11) + plt.ylabel("Accuracy", fontsize=11) + plt.title("Accuracy vs. Confidence", fontsize=12) + plt.legend(loc='lower right', fontsize=10, frameon=False) + + plt.grid(color='gray', linestyle=':', linewidth=0.5, alpha=0.6) + plt.gca().spines['top'].set_color('none') + plt.gca().spines['right'].set_color('none') + plt.gca().spines['left'].set_color('white') + plt.gca().spines['bottom'].set_color('white') + + plt.tight_layout() + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"confidence-calibration-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + +def compare_results(result_data): + prematch_stats = result_data["prematch_stats"] + prematch_results = result_data["prematch_results"] + match_stats = result_data["match_stats"] + match_results = result_data["match_results"] + final = result_data["cascade_result"] + dataset = result_data["dataset"] + + result = [] + + for prematch_result in prematch_results: + for final_result in final: + if final_result['left'] == prematch_result['left'] and final_result['right'] == prematch_result['right']: + final_pred = final_result['pred'] + break + + entry = { + "left": prematch_result['left'], + "right": prematch_result['right'], + "label": prematch_result['label'], + "final_pred": final_pred, + "prematch_results": { + "pred": prematch_result['pred'], + "confidence": prematch_result['confidence'], + "calibrated_confidence": prematch_result['calibrated_confidence'], + "explanation": prematch_result['explanation'], + "model_output": prematch_result['model_output'], + "tool_outputs": prematch_result['tool_outputs'], + "latency": prematch_result['latency'], + "tokens": prematch_result['tokens'] + }, + "match_results": {} + } + + for match_result in match_results: + if match_result['left'] == prematch_result['left'] and match_result['right'] == prematch_result['right']: + entry['match_results'] = { + "pred": match_result['pred'], + "confidence": match_result['confidence'], + "explanation": match_result['explanation'], + "model_output": match_result['model_output'], + "tool_outputs": match_result['tool_outputs'], + "latency": match_result['latency'], + "tokens": match_result['tokens'] + } + break + + result.append(entry) + + # Output to a JSON file + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + output_filename = f"results-by-stages-{dataset}-{timestamp}.json" + output_path = os.path.join("examples", "cascade", "output", dataset) + output_filepath = os.path.join(output_path, output_filename) + os.makedirs(os.path.dirname(output_filepath), exist_ok=True) + with open(output_filepath, "w") as json_file: + json.dump(result, json_file, indent=4) + + total_pairs = len(result) + num_prematch_only = 0 + num_prematch_only_correct = 0 + prematch_only_pairs = [] + + num_correct_flip = 0 + num_incorrect_flip = 0 + num_correct_no_flip = 0 + num_incorrect_no_flip = 0 + + match_correct_flip = [] + match_incorrect_flip = [] + match_correct_no_flip = [] + match_incorrect_no_flip = [] + + for entry in result: + if not entry["match_results"]: + num_prematch_only += 1 + prematch_only_pairs.append(entry) + label = entry["label"] + final_pred = entry["final_pred"] + if (label == 0 and "no" in final_pred.lower()) or (label == 1 and "yes" in final_pred.lower()): + 
num_prematch_only_correct += 1 + else: + label = entry["label"] + final_pred = entry["final_pred"] + prematch_pred = entry["prematch_results"]["pred"] + match_pred = entry["match_results"]["pred"] + correct_flip_0 = (label == 0 and "yes" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_flip_1 = (label == 1 and "no" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_flip_0 = (label == 0 and "no" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_flip_1 = (label == 1 and "yes" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_no_flip_0 = (label == 0 and "no" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_no_flip_1 = (label == 1 and "yes" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_no_flip_0 = (label == 1 and "no" in prematch_pred.lower() and "no" in match_pred.lower()) + incorrect_no_flip_1 = (label == 0 and "yes" in prematch_pred.lower() and "yes" in match_pred.lower()) + + if correct_flip_0 or correct_flip_1: + num_correct_flip += 1 + match_correct_flip.append(entry) + elif incorrect_flip_0 or incorrect_flip_1: + num_incorrect_flip += 1 + match_incorrect_flip.append(entry) + elif correct_no_flip_0 or correct_no_flip_1: + num_correct_no_flip += 1 + match_correct_no_flip.append(entry) + elif incorrect_no_flip_0 or incorrect_no_flip_1: + num_incorrect_no_flip += 1 + match_incorrect_no_flip.append(entry) + else: + print("Invalid entry") + + stat = { + "total_pairs": total_pairs, + "prematch_stats": prematch_stats, + "match_stats": match_stats, + "prematch_stage": { + "num_total": num_prematch_only, + "num_correct": num_prematch_only_correct, + }, + "match_stage":{ + "num_total": total_pairs - num_prematch_only, + "num_correct": num_correct_flip + num_correct_no_flip, + "num_correct_flip": num_correct_flip, + "num_incorrect_flip": num_incorrect_flip, + "num_correct_no_flip": num_correct_no_flip, + "num_incorrect_no_flip": num_incorrect_no_flip + } + } + result_analysis = { + "stat": stat, + "result": { + "prematch_final": prematch_only_pairs, + "match_correct_flip": match_correct_flip, + "match_incorrect_flip": match_incorrect_flip, + "match_correct_no_flip": match_correct_no_flip, + "match_incorrect_no_flip": match_incorrect_no_flip + } + } + + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + output_filename = f"analysis-{dataset}-{timestamp}.json" + output_path = os.path.join("examples", "cascade", "output", dataset) + output_filepath = os.path.join(output_path, output_filename) + os.makedirs(os.path.dirname(output_filepath), exist_ok=True) + with open(output_filepath, "w") as json_file: + json.dump(result_analysis, json_file, indent=4) + + print(f"Comparison results saved to {output_filepath}") + + +def sensitivity_analysis(args, dataset, thresholds, prematch_model="gpt-4o-mini", match_model="gpt-4o", num_pairs=2): + f1_scores = [] + costs = [] + + for threshold in thresholds: + results_data = online( + args=args, + dataset=dataset, + prematch_model=prematch_model, + match_model=match_model, + num_pairs=num_pairs, + threshold=threshold + ) + cascade_stats, prematch_single, match_single = generate_stats(results_data) + + f1_scores.append(cascade_stats['stats']['f1']) + costs.append(cascade_stats['stats']['tokens']['cost']) + + print(f"Threshold: {threshold:.1f}, F1 Score: {cascade_stats['stats']['f1']:.4f}, Cost: {cascade_stats['stats']['tokens']['cost']:.2f}") + + return f1_scores, costs + + +def confidence_cost_plot(thresholds, costs): + plt.figure(figsize=(8, 5)) 
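# ---- Editor's illustrative sketch (not part of the patch) ----
# sensitivity_analysis above returns one F1 score and one cost per confidence
# threshold, and the two plotting helpers below only visualize those curves.
# One way to turn them into a concrete threshold choice (an assumption, not
# something the patch does) is to take the cheapest threshold whose F1 stays
# within a small tolerance of the best F1 observed. The F1 values in the stats
# are on a 0-100 scale, so the tolerance is in percentage points.

def pick_threshold(thresholds, f1_scores, costs, tolerance=0.5):
    """Cheapest threshold whose F1 is within `tolerance` of the best F1."""
    best_f1 = max(f1_scores)
    candidates = [
        (cost, threshold)
        for threshold, f1, cost in zip(thresholds, f1_scores, costs)
        if f1 >= best_f1 - tolerance
    ]
    return min(candidates)[1]
# ---- End of sketch ----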
+ plt.plot(thresholds, costs, marker='s', linestyle='-', color='red', label='Cost') + plt.title('Confidence Threshold vs. Cost') + plt.xlabel('Confidence Threshold') + plt.ylabel('Cost') + plt.grid(True, linestyle='--', alpha=0.6) + plt.legend() + plt.tight_layout() + plt.show() + + output_path = os.path.join("examples", "cascade", "output") + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"sensitivity-confidence-over-cost-{timestamp}.png" + file_path = os.path.join(output_path, file_name) + plt.savefig(file_path) + + +def confidence_f1_plot(thresholds, f1): + plt.figure(figsize=(8, 5)) + plt.plot(thresholds, f1, marker='s', linestyle='-', color='red', label='F1') + plt.title('Confidence Threshold vs. F1') + plt.xlabel('Confidence Threshold') + plt.ylabel('F1') + plt.ylim(0, 1.05) + plt.grid(True, linestyle='--', alpha=0.6) + plt.legend() + plt.tight_layout() + plt.show() + + output_path = os.path.join("examples", "cascade", "output") + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"sensitivity-confidence-over-f1-{timestamp}.png" + file_path = os.path.join(output_path, file_name) + plt.savefig(file_path) diff --git a/libem/cascade/__init__.py b/libem/cascade/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libem/cascade/function.py b/libem/cascade/function.py new file mode 100644 index 0000000..61cc93d --- /dev/null +++ b/libem/cascade/function.py @@ -0,0 +1,58 @@ +import time +from libem.cascade.util import stage_filter +from libem.cascade.vectorize.function import run as vectorize +from libem.cascade.prematch.function import run as prematch +from libem.cascade.match.function import run as match + + +def online(args, dataset, prematch_model="gpt-4o-mini", match_model="gpt-4o", num_pairs=None, threshold=0.2): + + args.schema = False + args.name = dataset.__name__.split('.')[-1] + + if dataset: + train_set, test_set = vectorize(args, dataset, num_pairs) + num_pairs_test = len(list(test_set)) + args.num_pairs = num_pairs_test + cascade_stats, cascade_result = {}, [] + start_time = time.time() + prematch_stats, prematch_results = prematch(train_set, test_set, args, model_choice=prematch_model) + unconfident_pairs, confident_pairs = stage_filter(prematch_results, threshold) + cascade_result += confident_pairs + + if unconfident_pairs: + + match_stats, match_results = match(train_set, unconfident_pairs, args, model_choice=match_model) + end_time = time.time() + cascade_result += match_results + + else: + match_stats, match_results = {}, {} + end_time = time.time() + + + results_data = { + "args": args, + "dataset": args.name, + "train_set": train_set, + "test_set": test_set, + "prematch_model": prematch_model, + "match_model": match_model, + "num_pairs": num_pairs_test, + "threshold": threshold, + "prematch_stats": prematch_stats, + "prematch_results": prematch_results, + "match_stats": match_stats, + "match_results": match_results, + "cascade_stats": cascade_stats, + "cascade_result": cascade_result, + "start_time": start_time, + "end_time": end_time, + "confident_pairs": confident_pairs, + } + + return results_data + + +def offline(args): + pass diff --git a/libem/cascade/match/function.py b/libem/cascade/match/function.py new file mode 100644 index 0000000..c407d9d --- /dev/null +++ b/libem/cascade/match/function.py @@ -0,0 +1,24 @@ +import libem +import openai +import time +from libem.core.model.openai import reset +from libem.cascade.util import run as run_match + + +def run(train_set, test_set, args, 
model_choice="gpt-4o"): + reset() + + results, stats = {}, {} + + libem.calibrate({ + "libem.match.parameter.model": model_choice, + "libem.match.parameter.confidence": True + }, verbose=True) + args.model = model_choice + + stats, results = run_match(train_set, test_set, args) + + print("args", args) + print("run matching using model", args.model) + + return stats, results \ No newline at end of file diff --git a/libem/cascade/prematch/function.py b/libem/cascade/prematch/function.py new file mode 100644 index 0000000..9bfff0f --- /dev/null +++ b/libem/cascade/prematch/function.py @@ -0,0 +1,23 @@ +import libem +from benchmark.util import run_block +from libem.core.model.openai import reset +from libem.cascade.util import run as run_prematch + +def run(train_set, test_set, args, model_choice="gpt-4o-mini"): + reset() + + results, stats = {}, {} + + if model_choice == "block": + test_set, stats['block'], results['block'] = run_block(test_set, args) + + else: + libem.calibrate({ + "libem.match.parameter.model": model_choice, + "libem.match.parameter.confidence": True + }, verbose=True) + args.model = model_choice + + stats, results = run_prematch(train_set, test_set, args) + + return stats, results \ No newline at end of file diff --git a/libem/cascade/util.py b/libem/cascade/util.py new file mode 100644 index 0000000..9d10b43 --- /dev/null +++ b/libem/cascade/util.py @@ -0,0 +1,272 @@ +import time +import math +import numpy as np +import libem +from libem.core import eval +from libem.optimize import cost as cost_util +from libem.tune.learn.confidence.calibrate import temperature_scale +from libem.match import prompt as match_prompt +from libem.match import digest as match_digest + +from libem.core.struct import Shots, Shot + +def stage_filter(results, threshold=0.5): + low_confidence_pairs = [] + high_confidence_results = [] + + for result in results: + pred = result["pred"] + confidence = result['confidence'] + if confidence is not None and confidence < threshold: + low_confidence_pairs.append({ + 'left': result['left'], + 'right': result['right'], + 'label': result['label'] + }) + else: + high_confidence_results.append(result) + + return low_confidence_pairs, high_confidence_results + + +def run(train_set, test_set, args): + num_pairs = len(test_set) + num_batches = math.ceil(num_pairs / args.batch_size) + + print(f"Cascading: Matching {num_pairs} " + f"{'pair' if num_pairs == 1 else 'pairs'} " + f"{f'in {num_batches} batches ' if args.batch_size > 1 else ''}" + f"using {args.model}" + f"from the {args.name} dataset.") + + if args.num_shots > 0: + shots = Shots([ + Shot( + question=match_prompt.query(left=d['left'], right=d['right']), + answer="yes" if d['label'] == 1 else "no" + ) for d in train_set + ]) + print(f"Cascading: Using {args.num_shots} shots with {args.icl} strategy " + f"for in-context learning.") + else: + shots = [] + + start_time = time.time() + + results = {} + + with libem.trace as t: + libem.calibrate({ + "libem.match.parameter.batch_size": args.batch_size, + "libem.match.parameter.sync": args.sync, + "libem.match.prompt.shots": shots, + }) + + if args.sync and args.batch_size == 1: + print("Cascading: Running in synchronous mode with batch size 1.") + # iterate and match each pair + for i, data in enumerate(test_set): + if 0 < args.num_pairs < i + 1: + break + + left, right = data['left'], data['right'] + label = data['label'] + + if not args.quiet: + print(f"Pair #{i + 1}\n") + print(f"Entity 1: {left}\n") + print(f"Entity 2: {right}") + + num_retries = 0 + while 
True: + try: + is_match: dict = libem.match(left, right) + results[match_digest(left, right)] = { + 'left': left, + 'right': right, + 'label': label, + 'pred': is_match['answer'], + 'confidence': is_match['confidence'], + 'explanation': is_match['explanation'], + } + + if not args.quiet: + print(f"Match: {is_match['answer']}; " + f"Confidence: {is_match['confidence']}; " + f"Label: {label}\n") + break + except libem.ModelTimedoutException: + num_retries += 1 + print(f"Retrying {num_retries} time(s) " + f"due to model call timeout..") + else: + # prepare datasets + print("Cascading: Running in asynchronous mode.") + left, right, labels = [], [], [] + for i, data in enumerate(test_set): + + if 0 < args.num_pairs < i + 1: + break + + left.append(data['left']) + right.append(data['right']) + labels.append(data['label']) + + answers: list[dict] = libem.match(left, right) + + results = { + match_digest(l, r): { + 'left': l, + 'right': r, + 'label': label, + 'pred': is_match['answer'], + 'confidence': is_match['confidence'], + 'explanation': is_match['explanation'], + } + for l, r, label, is_match in zip(left, right, labels, answers) + } + + # fill in additional info from the trace + for span in t.get(): + if 'match' not in span: + continue + + match = span['match'] + left, right = match['left'], match['right'] + + # for batch matching, the trace are + # shared between pairs in each batch + if isinstance(left, list): + pass + else: + left, right = [left], [right] + + for l, r in zip(left, right): + digest = match_digest(l, r) + model_usage = match['model_usage'] + results[digest].update( + { + 'model_output': match['model_output'], + 'tool_outputs': match['tool_outputs'], + 'latency': libem.round(match['latency'], 2), + 'tokens': { + 'num_input_tokens': model_usage['num_input_tokens'], + 'num_output_tokens': model_usage['num_output_tokens'], + 'cost': libem.round(cost_util.get_cost( + args.model, + model_usage['num_input_tokens'], + model_usage['num_output_tokens'], + ), 4) + } + } + ) + end_time = time.time() + + # ignore the digest key + results = list(results.values()) + + truth = [result['label'] for result in results] + predictions = [1 if result['pred'] == 'yes' else 0 for result in results] + latencies = [result['latency'] for result in results] + confidences = [result['confidence'] for result in results if result['confidence'] is not None] + print(f"Model: {args.model}") + if args.model != "gpt-4o": + print(f"Calibrating confidences for model: {args.model}") + calibrated_confidences = temperature_scale(confidences, truth) + results = patch_calibrated_confidence(results, calibrated_confidences) + + # generate stats + metrics = eval.report( + truth, predictions + ) + telemetry = t.telemetry(flatten=True) + + stats = { + 'num_pairs': num_pairs, + 'precision': round(metrics['precision'] * 100, 2), + 'recall': round(metrics['recall'] * 100, 2), + 'f1': round(metrics['f1'] * 100, 2), + 'latency': round(end_time - start_time, 2), + 'throughput': libem.round(num_pairs / (end_time - start_time), 2), + 'accuracy': round(metrics['accuracy'] * 100, 2), + 'per_pair_latency': libem.round((end_time - start_time) / num_pairs, 2), + 'avg_batch_latency': libem.round(np.mean(latencies), 2), + 'avg_confidence': libem.round(np.mean(confidences), 2) if confidences else -1, + 'tokens': { + 'num_input_tokens': telemetry['model.num_input_tokens']['sum'], + 'num_output_tokens': telemetry['model.num_output_tokens']['sum'], + 'cost': libem.round(cost_util.get_cost( + args.model, + 
telemetry['model.num_input_tokens']['sum'], + telemetry['model.num_output_tokens']['sum'], + ), 4) + }, + 'confusion_matrix': { + 'tp': metrics['tp'], + 'fp': metrics['fp'], + 'tn': metrics['tn'], + 'fn': metrics['fn'], + } + } + + print(f"Cascading: Matching done in {stats['latency']}s.") + if not args.quiet: + print(f"Benchmark: Precision\t {stats['precision']}") + print(f"Benchmark: Recall\t {stats['recall']}") + print(f"Benchmark: F1 score\t {stats['f1']}") + print(f"Benchmark: Throughput\t {stats['throughput']} pps") + print(f"Benchmark: Cost \t ${stats['tokens']['cost']}") + + return stats, results + + +def patch_calibrated_confidence(results, calibrated_confidences): + for i, result in enumerate(results): + if result['confidence'] is not None: + results[i] = place_to_next( + result, 'confidence', 'calibrated_confidence', + round(calibrated_confidences[i], 2) + if result['confidence'] is not None else None + ) + return results + + +def place_to_next(d, key_next_to, key_to_place, value_to_place): + new_dict = dict() + + for key, value in d.items(): + new_dict[key] = value + if key == key_next_to: + new_dict[key_to_place] = value_to_place + + return new_dict + + +def profile(args, results, num_pairs): + truth = [result['label'] for result in results] + predictions = [1 if result['pred'] == 'yes' else 0 for result in results] + latencies = [result['latency'] for result in results] + confidences = [result['confidence'] for result in results if result['confidence'] is not None] + + # generate stats + metrics = eval.report( + truth, predictions + ) + + stats = { + 'num_pairs': num_pairs, + 'precision': round(metrics['precision'] * 100, 2), + 'recall': round(metrics['recall'] * 100, 2), + 'f1': round(metrics['f1'] * 100, 2), + 'accuracy': round(metrics['accuracy'] * 100, 2), + 'avg_batch_latency': libem.round(np.mean(latencies), 2), + 'avg_confidence': libem.round(np.mean(confidences), 2) if confidences else -1, + 'confusion_matrix': { + 'tp': metrics['tp'], + 'fp': metrics['fp'], + 'tn': metrics['tn'], + 'fn': metrics['fn'], + } + } + + return stats \ No newline at end of file diff --git a/libem/cascade/vectorize/function.py b/libem/cascade/vectorize/function.py new file mode 100644 index 0000000..500d60e --- /dev/null +++ b/libem/cascade/vectorize/function.py @@ -0,0 +1,25 @@ +import libem +import random + + +def run(args, dataset, num_pairs=None): + random.seed(42) + + # construct kwargs dict + kwargs = { + 'version': 0, + } + + if num_pairs: + args.num_pairs = num_pairs + + train_set = dataset.read_train(**kwargs) + test_set = dataset.read_test(**kwargs) + test_set = list(test_set) + + if num_pairs: + test_set = test_set[:num_pairs] + print(f"Number of test pairs: {len(test_set)}") + random.shuffle(test_set) + + return train_set, test_set, \ No newline at end of file diff --git a/libem/optimize/cost/openai.py b/libem/optimize/cost/openai.py index 6fb7905..9220c36 100644 --- a/libem/optimize/cost/openai.py +++ b/libem/optimize/cost/openai.py @@ -7,7 +7,6 @@ 'output_cost_per_token': 0 } - def get_model_info(model=None): if model is None: return cache.load_openai() diff --git a/libem/optimize/function.py b/libem/optimize/function.py index f098e34..765a055 100644 --- a/libem/optimize/function.py +++ b/libem/optimize/function.py @@ -26,9 +26,10 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False): break start = time.time() + matches = libem.match(left, right) preds = [ 1 if is_match["answer"].lower() == "yes" else 0 - for is_match in libem.match(left, right) + for is_match in 
matches
         ]
         latency = time.time() - start
@@ -36,6 +37,7 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False):
         p = eval.precision(truths, preds)
         r = eval.recall(truths, preds)
         tp, fp, tn, fn = eval.confusion_matrix(truths, preds)
+        confidence = sum([is_match['confidence'] for is_match in matches]) / len(matches)
 
         stats = t.stats()
         stats["model"]["cost"] = cost.get_cost(
@@ -68,6 +70,7 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False):
             "fn": fn,
             "tp": tp,
             "tn": tn,
+            "confidence": confidence,
             "num_model_calls": stats["model"]["num_model_calls"]["sum"],
             "num_input_tokens": stats["model"]["num_input_tokens"]["sum"],
             "num_output_tokens": stats["model"]["num_output_tokens"]["sum"],
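
A note on the calibration step: libem/cascade/util.py above rescales raw confidences with temperature_scale from libem.tune.learn.confidence.calibrate whenever the model is not gpt-4o. The sketch below only illustrates the general idea of temperature scaling; it is a minimal stand-in, assuming each confidence is the probability that the predicted answer is correct and `correct` flags whether it actually was, and it is not the library's implementation.

import numpy as np

def temperature_scale_sketch(confidences, correct):
    """Grid-search a single temperature that minimizes NLL, then rescale."""
    p = np.clip(np.asarray(confidences, dtype=float), 1e-6, 1 - 1e-6)
    y = np.asarray(correct, dtype=float)
    logits = np.log(p / (1 - p))                       # inverse sigmoid
    best_t, best_nll = 1.0, np.inf
    for t in np.linspace(0.25, 4.0, 376):              # step of 0.01
        q = np.clip(1 / (1 + np.exp(-logits / t)), 1e-6, 1 - 1e-6)
        nll = -np.mean(y * np.log(q) + (1 - y) * np.log(1 - q))
        if nll < best_nll:                             # keep the best-fitting temperature
            best_t, best_nll = t, nll
    return 1 / (1 + np.exp(-logits / best_t))          # calibrated confidences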