From 7faa2b30437dbee5a19f779eec97eae05d7eb8d4 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 21 Nov 2024 10:04:37 -0800 Subject: [PATCH 1/2] improve confidence score generation --- libem/core/struct/pattern.py | 4 ++-- libem/match/function.py | 26 ++++++++++++++++---------- libem/match/prompt.py | 13 ++++++++----- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/libem/core/struct/pattern.py b/libem/core/struct/pattern.py index ef62ad6..54ae2c8 100644 --- a/libem/core/struct/pattern.py +++ b/libem/core/struct/pattern.py @@ -9,6 +9,6 @@ ) Confidence = Prompt( - default="Give a confidence score from 0.0 to 1.0, with 0.0 being a guess " - "and 1.0 being confident, give the score only, do not justify.", + default="At the end, give a confidence score from 0.0 to 1.0, " + "do not justify.", ) diff --git a/libem/match/function.py b/libem/match/function.py index d561ba8..b0871b1 100644 --- a/libem/match/function.py +++ b/libem/match/function.py @@ -127,9 +127,9 @@ async def once(left: str, right: str) -> dict: prompt.role(), prompt.rules(), prompt.experiences(), - struct.CoT() if parameter.cot() else "", - struct.Confidence() if parameter.confidence() else "", + prompt.CoT() if parameter.cot() else "", prompt.output(), + prompt.Confidence() if parameter.confidence() else "", ) match_prompt = Prompt.join( @@ -289,20 +289,14 @@ def parse_output(output: str) -> dict: """Handle the model output of format: ``` (e.g., "Name Comparison: ...") - (e.g., "Confidence Score: 5" or "5") (e.g., yes) + (e.g., "Confidence Score: 0.9" or "0.9") ``` """ # remove empty lines and process lines in reverse order output = [s for s in output.splitlines() if s][::-1] - answer = output.pop(0).lower() - if prompt.output.value == Index("likelihood"): - answer = float(answer) - else: - answer = "yes" if "yes" in answer else "no" - - confidence, explanation = None, None + answer, confidence, explanation = "no", None, None if parameter.confidence(): for i, line in enumerate(output): @@ -312,6 +306,18 @@ def parse_output(output: str) -> dict: confidence = float(''.join(nums)) output = output[i + 1:] break + + for i, line in enumerate(output): + line = line.lower() + nums = re.findall(r"\d+\.\d+|\d+", line) + if prompt.output.value == Index("likelihood") and nums: + answer = float(answer) + output = output[i + 1:] + break + elif 'yes' in line or 'no' in line: + answer = "yes" if "yes" in line else "no" + output = output[i + 1:] + break if parameter.cot(): explanation = "\n".join(output[::-1]) diff --git a/libem/match/prompt.py b/libem/match/prompt.py index 517d19c..19e8874 100644 --- a/libem/match/prompt.py +++ b/libem/match/prompt.py @@ -2,6 +2,9 @@ from libem.core.struct.prompt import ( Shot, Rules, Experiences ) +from libem.core.struct.pattern import ( + CoT, Confidence +) from libem.match.parameter import model """System prompts""" @@ -25,17 +28,17 @@ default=Index( lambda: "strict" if model() in { - "llama3", "llama3.1", "llama3.2-3b", "llama3.2-1b", - "gpt-4o-2024-08-06", "claude-3-5-sonnet-20240620", + "llama3", "llama3.1", "llama3.2-3b", "llama3.2-1b", + "claude-3-5-sonnet-20240620", } else "standard" ), options={ - "standard": "At the end, give your answer in the form of a " + "standard": "Give your answer in the form of a " "single 'yes' or 'no'.", - "strict": "At the end, give your answer in the form of a " + "strict": "Give your answer in the form of a " "single 'yes' or 'no'. 
Nothing else.", - "likelihood": "At the end, give your answer strictly in the " + "likelihood": "Give your answer strictly in the " "format of a single number between 0.0 and 1.0, " "estimating the likelihood that the two entities " "are the same.", From cbe4aec3853a2918486f49be614cc52edee14df8 Mon Sep 17 00:00:00 2001 From: Char15Xu Date: Sun, 1 Dec 2024 23:49:51 -0500 Subject: [PATCH 2/2] Initial commit in cascade-feature --- .gitignore | 10 + examples/cascade/bench.py | 28 ++ examples/cascade/online.py | 11 + examples/cascade/util.py | 435 ++++++++++++++++++++++++++++ libem/cascade/__init__.py | 0 libem/cascade/function.py | 58 ++++ libem/cascade/match/function.py | 24 ++ libem/cascade/prematch/function.py | 23 ++ libem/cascade/util.py | 272 +++++++++++++++++ libem/cascade/vectorize/function.py | 25 ++ libem/optimize/cost/openai.py | 1 - libem/optimize/function.py | 5 +- 12 files changed, 890 insertions(+), 2 deletions(-) create mode 100644 examples/cascade/bench.py create mode 100644 examples/cascade/online.py create mode 100644 examples/cascade/util.py create mode 100644 libem/cascade/__init__.py create mode 100644 libem/cascade/function.py create mode 100644 libem/cascade/match/function.py create mode 100644 libem/cascade/prematch/function.py create mode 100644 libem/cascade/util.py create mode 100644 libem/cascade/vectorize/function.py diff --git a/.gitignore b/.gitignore index edff4e5..564ddf0 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,13 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ +# Ignore macOS .DS_Store files +.DS_Store +examples/.DS_Store +examples/cascade/.DS_Store + +# Ignore specific output folder +examples/cascade/output/ + +# Ignore libem-specific .DS_Store file +libem/.DS_Store \ No newline at end of file diff --git a/examples/cascade/bench.py b/examples/cascade/bench.py new file mode 100644 index 0000000..b3636ab --- /dev/null +++ b/examples/cascade/bench.py @@ -0,0 +1,28 @@ +import numpy as np +from benchmark.run import args +import json +from libem.prepare.datasets import abt_buy, beer, itunes_amazon +from libem.cascade.function import online +from util import sensitivity_analysis, generate_stats, confidence_cost_plot, confidence_f1_plot, plot_result, save_results, compare_results + +def bench(): + results_data = online(args(), dataset=abt_buy, num_pairs=None, threshold=0.9) + compare_results(results_data) + cascade_stats, prematch_single, match_single = generate_stats(results_data) + save_results(cascade_stats, prematch_single, match_single) + plot_result(cascade_stats, prematch_single, match_single) + + +def sensitivity_analysis(): + # Perform sensitivity analysis + thresholds = np.arange(0.1, 1.0, 0.1) + f1_scores, costs = sensitivity_analysis(args(), abt_buy, thresholds, num_pairs=100) + F1_SCORE = f1_scores + COSTS = costs + confidence_cost_plot(thresholds, COSTS) + confidence_f1_plot(thresholds, F1_SCORE) + + + +if __name__ == '__main__': + bench() \ No newline at end of file diff --git a/examples/cascade/online.py b/examples/cascade/online.py new file mode 100644 index 0000000..e0e44f8 --- /dev/null +++ b/examples/cascade/online.py @@ -0,0 +1,11 @@ +from libem.prepare.datasets import abt_buy +from benchmark.run import args +import numpy as np +from libem.cascade.function import online + +def main(): + online(args(), abt_buy, num_pairs=100, threshold=0.9) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/cascade/util.py 
b/examples/cascade/util.py new file mode 100644 index 0000000..bae241d --- /dev/null +++ b/examples/cascade/util.py @@ -0,0 +1,435 @@ +import os +import json +import libem +import numpy as np +from datetime import datetime +import matplotlib.pyplot as plt +from collections import defaultdict +from libem.core.eval import accuracy +from libem.cascade.util import profile +from libem.cascade.function import online +from libem.cascade.match.function import run as match + + +def benchmark(result): + cascade_stats = generate_stats(result) + + save_results(result, cascade_stats) + + plot_result(result, cascade_stats) + + return cascade_stats, result["cascade_result"] + + +def generate_stats(result): + + args = result["args"] + train_set = result["train_set"] + test_set = result["test_set"] + prematch_model = result["prematch_model"] + match_model = result["match_model"] + num_pairs = result["num_pairs"] + prematch_stats = result["prematch_stats"] + prematch_results = result["prematch_results"] + match_stats = result["match_stats"] + cascade_stats = result["cascade_stats"] + cascade_result = result["cascade_result"] + start_time = result["start_time"] + end_time = result["end_time"] + + print("Prematch Stats: ", prematch_stats) + print("Match Stats: ", match_stats) + + overall_stat = profile(args, cascade_result, num_pairs) + overall_stat['throughput'] = libem.round(len(test_set) / (end_time - start_time), 2) + overall_stat['tokens'] = {} + + prematch_cost = prematch_stats.get('tokens', {}).get('cost', 0) + match_cost = match_stats.get('tokens', {}).get('cost', 0) + print(prematch_cost, match_cost) + + overall_stat['tokens']['cost'] = prematch_cost + match_cost + cascade_stats = { + "dataset": args.name, + "stats": overall_stat, + "stats by stages": { + f"Prematch {prematch_model}": prematch_stats, + f"Match {match_model}": match_stats, + }, + "results": cascade_result + } + + result["cascade_stats"] = cascade_stats + + # Result when only Prematch model is used + prematch_single_stats, prematch_single_results = prematch_stats, prematch_results + + prematch_single = { + "dataset": args.name, + "model": prematch_model, + "stats": prematch_single_stats, + "results": prematch_single_results + } + + # Result when only Match model is used + match_single_state, match_single_result = match(train_set, test_set, args, model_choice=match_model) + + match_single = { + "dataset": args.name, + "model": match_model, + "stats": match_single_state, + "results": match_single_result + } + + return cascade_stats, prematch_single, match_single + + +def save_results(cascade, prematch_single, match_single): + # Save the output + dataset = cascade["dataset"] + output_path = os.path.join("examples", "cascade", "output", dataset) + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + + # Output of the Cascade + dataset = cascade["dataset"] + cascade_file_name = f"cascade-{dataset}-{timestamp}.json" + cascade_file_path = os.path.join(output_path, cascade_file_name) + os.makedirs(os.path.dirname(cascade_file_path), exist_ok=True) + with open(cascade_file_path, "w") as json_file: + json.dump(cascade, json_file, indent=4) + + # Output of the Prematch + dataset = prematch_single["dataset"] + prematch_model = prematch_single["model"] + prematch_file_name = f"{prematch_model}-{dataset}-{timestamp}.json" + prematch_file_path = os.path.join(output_path, prematch_file_name) + os.makedirs(os.path.dirname(prematch_file_path), exist_ok=True) + with open(prematch_file_path, "w") as json_file: + json.dump(prematch_single, json_file, 
indent=4) + + # Output of the Match + dataset = match_single["dataset"] + match_model = match_single["model"] + match_file_name = f"{match_model}-{dataset}-{timestamp}.json" + match_file_path = os.path.join(output_path, match_file_name) + os.makedirs(os.path.dirname(match_file_path), exist_ok=True) + with open(match_file_path, "w") as json_file: + json.dump(match_single, json_file, indent=4) + + +def plot_result(cascade, prematch_single, match_single): + dataset = cascade["dataset"] + output_path = os.path.join("examples", "cascade", "output", dataset) + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + + prematch_model = prematch_single["model"] + match_model = match_single["model"] + result_prematch = prematch_single["results"] + result_match = match_single["results"] + approaches = [prematch_model, match_model, "Cascade"] + + # Plot for F1 Graph + f1 = [data['stats']['f1'] for data in [prematch_single, match_single, cascade]] + min_f1, max_f1 = min(f1), max(f1) + y_margin = 2 + plt.bar(approaches, f1, color='skyblue', label='F1 Score') + for i, value in enumerate(f1): + plt.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom') + plt.ylim(min_f1 - y_margin, max_f1 + y_margin) + plt.ylabel('F1 Score') + plt.xlabel('Models') + plt.title('F1 Score', pad=20) + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.legend() + plt.tight_layout() + file_name = f"f1-comparison-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + # Plot for Cost + cost = [data['stats']['tokens']['cost'] for data in [prematch_single, match_single, cascade]] + plt.bar(approaches, cost, color='green', label='Cost') + for i, value in enumerate(cost): + plt.text(i, value + 0.0001, f'{value:.5f}', ha='center', va='bottom') + plt.ylabel('Cost') + plt.xlabel('Models') + plt.title('Cost') + plt.gca().spines['top'].set_visible(False) + plt.gca().spines['right'].set_visible(False) + plt.legend() + plt.tight_layout() + file_name = f"cost-comparison-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + # Confidence Calibration + prematch_acc = accuracy_groupby_confidence(result_prematch) + match_acc = accuracy_groupby_confidence(result_match) + output_path = os.path.join("examples", "cascade", "output", dataset) + confidence_calibration_graph(prematch_acc, match_acc, dataset, output_path) + + +def accuracy_groupby_confidence(data): + confidence_dict = defaultdict(list) + for item in data: + confidence = item['confidence'] + confidence_dict[confidence].append(item) + + accuracy_dict = {} + + for confidence, items in confidence_dict.items(): + truth = [item['label'] == 1 for item in items] + predictions = [item['pred'] == "yes" for item in items] + acc = accuracy(truth, predictions) + if len(items) > 5: + accuracy_dict[confidence] = acc + + keys = accuracy_dict.keys() + keys = [key for key in keys if key is not None] + keys_sorted = sorted(keys) + sorted_dict = {key: accuracy_dict[key] for key in keys_sorted} + + return sorted_dict + + +def confidence_calibration_graph(prematch_acc, match_acc, dataset, output_path): + x_gpt4o_mini = list(prematch_acc.keys()) + y_gpt4o_mini = list(prematch_acc.values()) + + x_gpt4o = list(match_acc.keys()) + y_gpt4o = list(match_acc.values()) + + perfect_calibration = np.linspace(0, 1, 100) + + # Plot + plt.figure(figsize=(8, 6)) + plt.plot(x_gpt4o_mini, y_gpt4o_mini, '-o', label="GPT-4o-mini", color='darkblue', linewidth=1.5, alpha=0.8) + 
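# ---- Editor's illustrative sketch (not part of the patch) ----
# accuracy_groupby_confidence above buckets results by the exact confidence
# value the model emitted and drops buckets with five or fewer pairs. A common
# alternative (an assumption here, not what the patch does) is to bin
# confidences into fixed-width intervals, which smooths the calibration curve
# when few pairs share the same score:
import numpy as np

def binned_calibration(confidences, correct, n_bins=10):
    """Return (bin_centers, per_bin_accuracy) for a reliability plot."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=bool)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # fold a confidence of exactly 1.0 into the last bin so it is not dropped
    bins = np.minimum(np.digitize(confidences, edges) - 1, n_bins - 1)
    centers, accuracies = [], []
    for b in range(n_bins):
        in_bin = bins == b
        if in_bin.any():
            centers.append((edges[b] + edges[b + 1]) / 2)
            accuracies.append(correct[in_bin].mean())
    return centers, accuracies
# ---- End of sketch ----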
plt.plot(x_gpt4o, y_gpt4o, '-o', label="GPT-4o", color='red', linewidth=1.5, alpha=0.8) + plt.plot(perfect_calibration, perfect_calibration, '--', label="Perfect Calibration", color='black', alpha=0.5) + + plt.xlabel("Confidence", fontsize=11) + plt.ylabel("Accuracy", fontsize=11) + plt.title("Accuracy vs. Confidence", fontsize=12) + plt.legend(loc='lower right', fontsize=10, frameon=False) + + plt.grid(color='gray', linestyle=':', linewidth=0.5, alpha=0.6) + plt.gca().spines['top'].set_color('none') + plt.gca().spines['right'].set_color('none') + plt.gca().spines['left'].set_color('white') + plt.gca().spines['bottom'].set_color('white') + + plt.tight_layout() + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"confidence-calibration-{dataset}-{timestamp}.png" + plt.savefig(os.path.join(output_path, file_name)) + plt.show() + + +def compare_results(result_data): + prematch_stats = result_data["prematch_stats"] + prematch_results = result_data["prematch_results"] + match_stats = result_data["match_stats"] + match_results = result_data["match_results"] + final = result_data["cascade_result"] + dataset = result_data["dataset"] + + result = [] + + for prematch_result in prematch_results: + for final_result in final: + if final_result['left'] == prematch_result['left'] and final_result['right'] == prematch_result['right']: + final_pred = final_result['pred'] + break + + entry = { + "left": prematch_result['left'], + "right": prematch_result['right'], + "label": prematch_result['label'], + "final_pred": final_pred, + "prematch_results": { + "pred": prematch_result['pred'], + "confidence": prematch_result['confidence'], + "calibrated_confidence": prematch_result['calibrated_confidence'], + "explanation": prematch_result['explanation'], + "model_output": prematch_result['model_output'], + "tool_outputs": prematch_result['tool_outputs'], + "latency": prematch_result['latency'], + "tokens": prematch_result['tokens'] + }, + "match_results": {} + } + + for match_result in match_results: + if match_result['left'] == prematch_result['left'] and match_result['right'] == prematch_result['right']: + entry['match_results'] = { + "pred": match_result['pred'], + "confidence": match_result['confidence'], + "explanation": match_result['explanation'], + "model_output": match_result['model_output'], + "tool_outputs": match_result['tool_outputs'], + "latency": match_result['latency'], + "tokens": match_result['tokens'] + } + break + + result.append(entry) + + # Output to a JSON file + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + output_filename = f"results-by-stages-{dataset}-{timestamp}.json" + output_path = os.path.join("examples", "cascade", "output", dataset) + output_filepath = os.path.join(output_path, output_filename) + os.makedirs(os.path.dirname(output_filepath), exist_ok=True) + with open(output_filepath, "w") as json_file: + json.dump(result, json_file, indent=4) + + total_pairs = len(result) + num_prematch_only = 0 + num_prematch_only_correct = 0 + prematch_only_pairs = [] + + num_correct_flip = 0 + num_incorrect_flip = 0 + num_correct_no_flip = 0 + num_incorrect_no_flip = 0 + + match_correct_flip = [] + match_incorrect_flip = [] + match_correct_no_flip = [] + match_incorrect_no_flip = [] + + for entry in result: + if not entry["match_results"]: + num_prematch_only += 1 + prematch_only_pairs.append(entry) + label = entry["label"] + final_pred = entry["final_pred"] + if (label == 0 and "no" in final_pred.lower()) or (label == 1 and "yes" in final_pred.lower()): + 
num_prematch_only_correct += 1 + else: + label = entry["label"] + final_pred = entry["final_pred"] + prematch_pred = entry["prematch_results"]["pred"] + match_pred = entry["match_results"]["pred"] + correct_flip_0 = (label == 0 and "yes" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_flip_1 = (label == 1 and "no" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_flip_0 = (label == 0 and "no" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_flip_1 = (label == 1 and "yes" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_no_flip_0 = (label == 0 and "no" in prematch_pred.lower() and "no" in match_pred.lower()) + correct_no_flip_1 = (label == 1 and "yes" in prematch_pred.lower() and "yes" in match_pred.lower()) + incorrect_no_flip_0 = (label == 1 and "no" in prematch_pred.lower() and "no" in match_pred.lower()) + incorrect_no_flip_1 = (label == 0 and "yes" in prematch_pred.lower() and "yes" in match_pred.lower()) + + if correct_flip_0 or correct_flip_1: + num_correct_flip += 1 + match_correct_flip.append(entry) + elif incorrect_flip_0 or incorrect_flip_1: + num_incorrect_flip += 1 + match_incorrect_flip.append(entry) + elif correct_no_flip_0 or correct_no_flip_1: + num_correct_no_flip += 1 + match_correct_no_flip.append(entry) + elif incorrect_no_flip_0 or incorrect_no_flip_1: + num_incorrect_no_flip += 1 + match_incorrect_no_flip.append(entry) + else: + print("Invalid entry") + + stat = { + "total_pairs": total_pairs, + "prematch_stats": prematch_stats, + "match_stats": match_stats, + "prematch_stage": { + "num_total": num_prematch_only, + "num_correct": num_prematch_only_correct, + }, + "match_stage":{ + "num_total": total_pairs - num_prematch_only, + "num_correct": num_correct_flip + num_correct_no_flip, + "num_correct_flip": num_correct_flip, + "num_incorrect_flip": num_incorrect_flip, + "num_correct_no_flip": num_correct_no_flip, + "num_incorrect_no_flip": num_incorrect_no_flip + } + } + result_analysis = { + "stat": stat, + "result": { + "prematch_final": prematch_only_pairs, + "match_correct_flip": match_correct_flip, + "match_incorrect_flip": match_incorrect_flip, + "match_correct_no_flip": match_correct_no_flip, + "match_incorrect_no_flip": match_incorrect_no_flip + } + } + + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + output_filename = f"analysis-{dataset}-{timestamp}.json" + output_path = os.path.join("examples", "cascade", "output", dataset) + output_filepath = os.path.join(output_path, output_filename) + os.makedirs(os.path.dirname(output_filepath), exist_ok=True) + with open(output_filepath, "w") as json_file: + json.dump(result_analysis, json_file, indent=4) + + print(f"Comparison results saved to {output_filepath}") + + +def sensitivity_analysis(args, dataset, thresholds, prematch_model="gpt-4o-mini", match_model="gpt-4o", num_pairs=2): + f1_scores = [] + costs = [] + + for threshold in thresholds: + results_data = online( + args=args, + dataset=dataset, + prematch_model=prematch_model, + match_model=match_model, + num_pairs=num_pairs, + threshold=threshold + ) + cascade_stats, prematch_single, match_single = generate_stats(results_data) + + f1_scores.append(cascade_stats['stats']['f1']) + costs.append(cascade_stats['stats']['tokens']['cost']) + + print(f"Threshold: {threshold:.1f}, F1 Score: {cascade_stats['stats']['f1']:.4f}, Cost: {cascade_stats['stats']['tokens']['cost']:.2f}") + + return f1_scores, costs + + +def confidence_cost_plot(thresholds, costs): + plt.figure(figsize=(8, 5)) 
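# ---- Editor's illustrative sketch (not part of the patch) ----
# sensitivity_analysis above returns one F1 score and one cost per confidence
# threshold, and the two plotting helpers below only visualize those curves.
# One way to turn them into a concrete threshold choice (an assumption, not
# something the patch does) is to take the cheapest threshold whose F1 stays
# within a small tolerance of the best F1 observed. The F1 values in the stats
# are on a 0-100 scale, so the tolerance is in percentage points.

def pick_threshold(thresholds, f1_scores, costs, tolerance=0.5):
    """Cheapest threshold whose F1 is within `tolerance` of the best F1."""
    best_f1 = max(f1_scores)
    candidates = [
        (cost, threshold)
        for threshold, f1, cost in zip(thresholds, f1_scores, costs)
        if f1 >= best_f1 - tolerance
    ]
    return min(candidates)[1]
# ---- End of sketch ----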
+ plt.plot(thresholds, costs, marker='s', linestyle='-', color='red', label='Cost') + plt.title('Confidence Threshold vs. Cost') + plt.xlabel('Confidence Threshold') + plt.ylabel('Cost') + plt.grid(True, linestyle='--', alpha=0.6) + plt.legend() + plt.tight_layout() + plt.show() + + output_path = os.path.join("examples", "cascade", "output") + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"sensitivity-confidence-over-cost-{timestamp}.png" + file_path = os.path.join(output_path, file_name) + plt.savefig(file_path) + + +def confidence_f1_plot(thresholds, f1): + plt.figure(figsize=(8, 5)) + plt.plot(thresholds, f1, marker='s', linestyle='-', color='red', label='F1') + plt.title('Confidence Threshold vs. F1') + plt.xlabel('Confidence Threshold') + plt.ylabel('F1') + plt.ylim(0, 1.05) + plt.grid(True, linestyle='--', alpha=0.6) + plt.legend() + plt.tight_layout() + plt.show() + + output_path = os.path.join("examples", "cascade", "output") + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + file_name = f"sensitivity-confidence-over-f1-{timestamp}.png" + file_path = os.path.join(output_path, file_name) + plt.savefig(file_path) diff --git a/libem/cascade/__init__.py b/libem/cascade/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libem/cascade/function.py b/libem/cascade/function.py new file mode 100644 index 0000000..61cc93d --- /dev/null +++ b/libem/cascade/function.py @@ -0,0 +1,58 @@ +import time +from libem.cascade.util import stage_filter +from libem.cascade.vectorize.function import run as vectorize +from libem.cascade.prematch.function import run as prematch +from libem.cascade.match.function import run as match + + +def online(args, dataset, prematch_model="gpt-4o-mini", match_model="gpt-4o", num_pairs=None, threshold=0.2): + + args.schema = False + args.name = dataset.__name__.split('.')[-1] + + if dataset: + train_set, test_set = vectorize(args, dataset, num_pairs) + num_pairs_test = len(list(test_set)) + args.num_pairs = num_pairs_test + cascade_stats, cascade_result = {}, [] + start_time = time.time() + prematch_stats, prematch_results = prematch(train_set, test_set, args, model_choice=prematch_model) + unconfident_pairs, confident_pairs = stage_filter(prematch_results, threshold) + cascade_result += confident_pairs + + if unconfident_pairs: + + match_stats, match_results = match(train_set, unconfident_pairs, args, model_choice=match_model) + end_time = time.time() + cascade_result += match_results + + else: + match_stats, match_results = {}, {} + end_time = time.time() + + + results_data = { + "args": args, + "dataset": args.name, + "train_set": train_set, + "test_set": test_set, + "prematch_model": prematch_model, + "match_model": match_model, + "num_pairs": num_pairs_test, + "threshold": threshold, + "prematch_stats": prematch_stats, + "prematch_results": prematch_results, + "match_stats": match_stats, + "match_results": match_results, + "cascade_stats": cascade_stats, + "cascade_result": cascade_result, + "start_time": start_time, + "end_time": end_time, + "confident_pairs": confident_pairs, + } + + return results_data + + +def offline(args): + pass diff --git a/libem/cascade/match/function.py b/libem/cascade/match/function.py new file mode 100644 index 0000000..c407d9d --- /dev/null +++ b/libem/cascade/match/function.py @@ -0,0 +1,24 @@ +import libem +import openai +import time +from libem.core.model.openai import reset +from libem.cascade.util import run as run_match + + +def run(train_set, test_set, args, 
model_choice="gpt-4o"): + reset() + + results, stats = {}, {} + + libem.calibrate({ + "libem.match.parameter.model": model_choice, + "libem.match.parameter.confidence": True + }, verbose=True) + args.model = model_choice + + stats, results = run_match(train_set, test_set, args) + + print("args", args) + print("run matching using model", args.model) + + return stats, results \ No newline at end of file diff --git a/libem/cascade/prematch/function.py b/libem/cascade/prematch/function.py new file mode 100644 index 0000000..9bfff0f --- /dev/null +++ b/libem/cascade/prematch/function.py @@ -0,0 +1,23 @@ +import libem +from benchmark.util import run_block +from libem.core.model.openai import reset +from libem.cascade.util import run as run_prematch + +def run(train_set, test_set, args, model_choice="gpt-4o-mini"): + reset() + + results, stats = {}, {} + + if model_choice == "block": + test_set, stats['block'], results['block'] = run_block(test_set, args) + + else: + libem.calibrate({ + "libem.match.parameter.model": model_choice, + "libem.match.parameter.confidence": True + }, verbose=True) + args.model = model_choice + + stats, results = run_prematch(train_set, test_set, args) + + return stats, results \ No newline at end of file diff --git a/libem/cascade/util.py b/libem/cascade/util.py new file mode 100644 index 0000000..9d10b43 --- /dev/null +++ b/libem/cascade/util.py @@ -0,0 +1,272 @@ +import time +import math +import numpy as np +import libem +from libem.core import eval +from libem.optimize import cost as cost_util +from libem.tune.learn.confidence.calibrate import temperature_scale +from libem.match import prompt as match_prompt +from libem.match import digest as match_digest + +from libem.core.struct import Shots, Shot + +def stage_filter(results, threshold=0.5): + low_confidence_pairs = [] + high_confidence_results = [] + + for result in results: + pred = result["pred"] + confidence = result['confidence'] + if confidence is not None and confidence < threshold: + low_confidence_pairs.append({ + 'left': result['left'], + 'right': result['right'], + 'label': result['label'] + }) + else: + high_confidence_results.append(result) + + return low_confidence_pairs, high_confidence_results + + +def run(train_set, test_set, args): + num_pairs = len(test_set) + num_batches = math.ceil(num_pairs / args.batch_size) + + print(f"Cascading: Matching {num_pairs} " + f"{'pair' if num_pairs == 1 else 'pairs'} " + f"{f'in {num_batches} batches ' if args.batch_size > 1 else ''}" + f"using {args.model}" + f"from the {args.name} dataset.") + + if args.num_shots > 0: + shots = Shots([ + Shot( + question=match_prompt.query(left=d['left'], right=d['right']), + answer="yes" if d['label'] == 1 else "no" + ) for d in train_set + ]) + print(f"Cascading: Using {args.num_shots} shots with {args.icl} strategy " + f"for in-context learning.") + else: + shots = [] + + start_time = time.time() + + results = {} + + with libem.trace as t: + libem.calibrate({ + "libem.match.parameter.batch_size": args.batch_size, + "libem.match.parameter.sync": args.sync, + "libem.match.prompt.shots": shots, + }) + + if args.sync and args.batch_size == 1: + print("Cascading: Running in synchronous mode with batch size 1.") + # iterate and match each pair + for i, data in enumerate(test_set): + if 0 < args.num_pairs < i + 1: + break + + left, right = data['left'], data['right'] + label = data['label'] + + if not args.quiet: + print(f"Pair #{i + 1}\n") + print(f"Entity 1: {left}\n") + print(f"Entity 2: {right}") + + num_retries = 0 + while 
True: + try: + is_match: dict = libem.match(left, right) + results[match_digest(left, right)] = { + 'left': left, + 'right': right, + 'label': label, + 'pred': is_match['answer'], + 'confidence': is_match['confidence'], + 'explanation': is_match['explanation'], + } + + if not args.quiet: + print(f"Match: {is_match['answer']}; " + f"Confidence: {is_match['confidence']}; " + f"Label: {label}\n") + break + except libem.ModelTimedoutException: + num_retries += 1 + print(f"Retrying {num_retries} time(s) " + f"due to model call timeout..") + else: + # prepare datasets + print("Cascading: Running in asynchronous mode.") + left, right, labels = [], [], [] + for i, data in enumerate(test_set): + + if 0 < args.num_pairs < i + 1: + break + + left.append(data['left']) + right.append(data['right']) + labels.append(data['label']) + + answers: list[dict] = libem.match(left, right) + + results = { + match_digest(l, r): { + 'left': l, + 'right': r, + 'label': label, + 'pred': is_match['answer'], + 'confidence': is_match['confidence'], + 'explanation': is_match['explanation'], + } + for l, r, label, is_match in zip(left, right, labels, answers) + } + + # fill in additional info from the trace + for span in t.get(): + if 'match' not in span: + continue + + match = span['match'] + left, right = match['left'], match['right'] + + # for batch matching, the trace are + # shared between pairs in each batch + if isinstance(left, list): + pass + else: + left, right = [left], [right] + + for l, r in zip(left, right): + digest = match_digest(l, r) + model_usage = match['model_usage'] + results[digest].update( + { + 'model_output': match['model_output'], + 'tool_outputs': match['tool_outputs'], + 'latency': libem.round(match['latency'], 2), + 'tokens': { + 'num_input_tokens': model_usage['num_input_tokens'], + 'num_output_tokens': model_usage['num_output_tokens'], + 'cost': libem.round(cost_util.get_cost( + args.model, + model_usage['num_input_tokens'], + model_usage['num_output_tokens'], + ), 4) + } + } + ) + end_time = time.time() + + # ignore the digest key + results = list(results.values()) + + truth = [result['label'] for result in results] + predictions = [1 if result['pred'] == 'yes' else 0 for result in results] + latencies = [result['latency'] for result in results] + confidences = [result['confidence'] for result in results if result['confidence'] is not None] + print(f"Model: {args.model}") + if args.model != "gpt-4o": + print(f"Calibrating confidences for model: {args.model}") + calibrated_confidences = temperature_scale(confidences, truth) + results = patch_calibrated_confidence(results, calibrated_confidences) + + # generate stats + metrics = eval.report( + truth, predictions + ) + telemetry = t.telemetry(flatten=True) + + stats = { + 'num_pairs': num_pairs, + 'precision': round(metrics['precision'] * 100, 2), + 'recall': round(metrics['recall'] * 100, 2), + 'f1': round(metrics['f1'] * 100, 2), + 'latency': round(end_time - start_time, 2), + 'throughput': libem.round(num_pairs / (end_time - start_time), 2), + 'accuracy': round(metrics['accuracy'] * 100, 2), + 'per_pair_latency': libem.round((end_time - start_time) / num_pairs, 2), + 'avg_batch_latency': libem.round(np.mean(latencies), 2), + 'avg_confidence': libem.round(np.mean(confidences), 2) if confidences else -1, + 'tokens': { + 'num_input_tokens': telemetry['model.num_input_tokens']['sum'], + 'num_output_tokens': telemetry['model.num_output_tokens']['sum'], + 'cost': libem.round(cost_util.get_cost( + args.model, + 
telemetry['model.num_input_tokens']['sum'], + telemetry['model.num_output_tokens']['sum'], + ), 4) + }, + 'confusion_matrix': { + 'tp': metrics['tp'], + 'fp': metrics['fp'], + 'tn': metrics['tn'], + 'fn': metrics['fn'], + } + } + + print(f"Cascading: Matching done in {stats['latency']}s.") + if not args.quiet: + print(f"Benchmark: Precision\t {stats['precision']}") + print(f"Benchmark: Recall\t {stats['recall']}") + print(f"Benchmark: F1 score\t {stats['f1']}") + print(f"Benchmark: Throughput\t {stats['throughput']} pps") + print(f"Benchmark: Cost \t ${stats['tokens']['cost']}") + + return stats, results + + +def patch_calibrated_confidence(results, calibrated_confidences): + for i, result in enumerate(results): + if result['confidence'] is not None: + results[i] = place_to_next( + result, 'confidence', 'calibrated_confidence', + round(calibrated_confidences[i], 2) + if result['confidence'] is not None else None + ) + return results + + +def place_to_next(d, key_next_to, key_to_place, value_to_place): + new_dict = dict() + + for key, value in d.items(): + new_dict[key] = value + if key == key_next_to: + new_dict[key_to_place] = value_to_place + + return new_dict + + +def profile(args, results, num_pairs): + truth = [result['label'] for result in results] + predictions = [1 if result['pred'] == 'yes' else 0 for result in results] + latencies = [result['latency'] for result in results] + confidences = [result['confidence'] for result in results if result['confidence'] is not None] + + # generate stats + metrics = eval.report( + truth, predictions + ) + + stats = { + 'num_pairs': num_pairs, + 'precision': round(metrics['precision'] * 100, 2), + 'recall': round(metrics['recall'] * 100, 2), + 'f1': round(metrics['f1'] * 100, 2), + 'accuracy': round(metrics['accuracy'] * 100, 2), + 'avg_batch_latency': libem.round(np.mean(latencies), 2), + 'avg_confidence': libem.round(np.mean(confidences), 2) if confidences else -1, + 'confusion_matrix': { + 'tp': metrics['tp'], + 'fp': metrics['fp'], + 'tn': metrics['tn'], + 'fn': metrics['fn'], + } + } + + return stats \ No newline at end of file diff --git a/libem/cascade/vectorize/function.py b/libem/cascade/vectorize/function.py new file mode 100644 index 0000000..500d60e --- /dev/null +++ b/libem/cascade/vectorize/function.py @@ -0,0 +1,25 @@ +import libem +import random + + +def run(args, dataset, num_pairs=None): + random.seed(42) + + # construct kwargs dict + kwargs = { + 'version': 0, + } + + if num_pairs: + args.num_pairs = num_pairs + + train_set = dataset.read_train(**kwargs) + test_set = dataset.read_test(**kwargs) + test_set = list(test_set) + + if num_pairs: + test_set = test_set[:num_pairs] + print(f"Number of test pairs: {len(test_set)}") + random.shuffle(test_set) + + return train_set, test_set, \ No newline at end of file diff --git a/libem/optimize/cost/openai.py b/libem/optimize/cost/openai.py index 6fb7905..9220c36 100644 --- a/libem/optimize/cost/openai.py +++ b/libem/optimize/cost/openai.py @@ -7,7 +7,6 @@ 'output_cost_per_token': 0 } - def get_model_info(model=None): if model is None: return cache.load_openai() diff --git a/libem/optimize/function.py b/libem/optimize/function.py index f098e34..765a055 100644 --- a/libem/optimize/function.py +++ b/libem/optimize/function.py @@ -26,9 +26,10 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False): break start = time.time() + matches = libem.match(left, right) preds = [ 1 if is_match["answer"].lower() == "yes" else 0 - for is_match in libem.match(left, right) + for is_match in 
matches
         ]
         latency = time.time() - start
@@ -36,6 +37,7 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False):
         p = eval.precision(truths, preds)
         r = eval.recall(truths, preds)
         tp, fp, tn, fn = eval.confusion_matrix(truths, preds)
+        confidence = sum([is_match['confidence'] for is_match in matches]) / len(matches)
 
         stats = t.stats()
         stats["model"]["cost"] = cost.get_cost(
@@ -68,6 +70,7 @@ def profile(dataset: Iterable, num_samples=-1, detailed=False):
             "fn": fn,
             "tp": tp,
             "tn": tn,
+            "confidence": confidence,
             "num_model_calls": stats["model"]["num_model_calls"]["sum"],
             "num_input_tokens": stats["model"]["num_input_tokens"]["sum"],
             "num_output_tokens": stats["model"]["num_output_tokens"]["sum"],
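
A note on the calibration step: libem/cascade/util.py above rescales raw confidences with temperature_scale from libem.tune.learn.confidence.calibrate whenever the model is not gpt-4o. The sketch below only illustrates the general idea of temperature scaling; it is a minimal stand-in, assuming each confidence is the probability that the predicted answer is correct and `correct` flags whether it actually was, and it is not the library's implementation.

import numpy as np

def temperature_scale_sketch(confidences, correct):
    """Grid-search a single temperature that minimizes NLL, then rescale."""
    p = np.clip(np.asarray(confidences, dtype=float), 1e-6, 1 - 1e-6)
    y = np.asarray(correct, dtype=float)
    logits = np.log(p / (1 - p))                       # inverse sigmoid
    best_t, best_nll = 1.0, np.inf
    for t in np.linspace(0.25, 4.0, 376):              # step of 0.01
        q = np.clip(1 / (1 + np.exp(-logits / t)), 1e-6, 1 - 1e-6)
        nll = -np.mean(y * np.log(q) + (1 - y) * np.log(1 - q))
        if nll < best_nll:                             # keep the best-fitting temperature
            best_t, best_nll = t, nll
    return 1 / (1 + np.exp(-logits / best_t))          # calibrated confidences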