Format entire codebase #57

Merged
merged 1 commit on Oct 3, 2024
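
Every hunk below is a pure formatting change: string quotes are normalized to double quotes, long calls are wrapped, and blank lines are added around decorated functions. That is consistent with a repo-wide pass of an opinionated formatter such as Black, although the PR does not name the tool or its configuration. A minimal, hypothetical reproduction of such a pass is sketched below; the `black` invocation is an assumption, not something stated in this PR.

# Hypothetical sketch of a repo-wide formatting pass; assumes the `black`
# package is installed and that Black (or a Black-compatible formatter)
# produced this diff, which the PR itself does not state.
import subprocess

# Rewrite every Python file under the repository root in place.
subprocess.run(["black", "."], check=True)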
3 changes: 1 addition & 2 deletions eval/mmlu/__init__.py
@@ -1,5 +1,4 @@
from .components import run_mmlu_op, load_mmlu_results_op
#from . import faked
# from . import faked

__all__ = ["run_mmlu_op", "load_mmlu_results_op"]

33 changes: 23 additions & 10 deletions eval/mmlu/components.py
@@ -6,6 +6,7 @@

EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"


@component(base_image=EVAL_IMAGE)
def run_mmlu_op(
mmlu_output: Output[Artifact],
@@ -16,21 +17,27 @@ def run_mmlu_op(
batch_size: int,
device: str,
models_list: List[str],
) -> NamedTuple('outputs', best_model=str, best_score=float):
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
import torch
from instructlab.eval.mmlu import MMLUEvaluator, MMLU_TASKS

mmlu_tasks = mmlu_tasks_list.split(',') if mmlu_tasks_list else MMLU_TASKS
mmlu_tasks = mmlu_tasks_list.split(",") if mmlu_tasks_list else MMLU_TASKS

# Device setup and debug
gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if gpu_available else "No GPU available"
gpu_name = (
torch.cuda.get_device_name(torch.cuda.current_device())
if gpu_available
else "No GPU available"
)

print(f"GPU Available: {gpu_available}, Using: {gpu_name}")

effective_device = device if device is not None else ("cuda" if gpu_available else "cpu")
effective_device = (
device if device is not None else ("cuda" if gpu_available else "cpu")
)
print(f"Running on device: {effective_device}")

scores = {}
@@ -53,33 +60,39 @@ def run_mmlu_op(

mmlu_score, individual_scores = evaluator.run()
average_score = round(mmlu_score, 2)
print(f"Model {model_name} is stored at: {model_path} with AVERAGE_SCORE: {average_score}")
print(
f"Model {model_name} is stored at: {model_path} with AVERAGE_SCORE: {average_score}"
)

mmlu_data = {
"report_title": "KNOWLEDGE EVALUATION REPORT",
"model": model_name,
"average_score": average_score,
"number_of_tasks": len(individual_scores),
"individual_scores": [{task: round(score['score'], 2)} for task, score in individual_scores.items()]
"individual_scores": [
{task: round(score["score"], 2)}
for task, score in individual_scores.items()
],
}

all_mmlu_data.append(mmlu_data)
scores[model_path] = average_score

with open(mmlu_output.path, 'w') as f:
with open(mmlu_output.path, "w") as f:
json.dump(all_mmlu_data, f, indent=4)
outputs = NamedTuple('outputs', best_model=str, best_score=float)
outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
return outputs(best_model=best_model, best_score=best_score)


@component(base_image=PYTHON_IMAGE)
def load_mmlu_results_op(mmlu_output: Input[Artifact]) -> list:
import json

mmlu_score_list = []
with open(mmlu_output.path, 'r') as f:
mmlu_score_list = json.load(f)
with open(mmlu_output.path, "r") as f:
mmlu_score_list = json.load(f)

print("MMLU Evaluation Data:")
for mmlu_score in mmlu_score_list:
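
The hunks above are truncated in the web view, but the two components they touch follow the usual KFP dataflow: run_mmlu_op writes a JSON report to an Output[Artifact] and returns a NamedTuple with the best model and score, and load_mmlu_results_op reads that artifact back. The sketch below illustrates that handoff with self-contained stand-in components, since parts of the real signatures are collapsed in the diff; every name in it (eval_op, load_results_op, eval-sketch) is hypothetical, and it assumes only kfp v2.

# Hypothetical stand-ins illustrating the Output[Artifact] + NamedTuple pattern
# used by run_mmlu_op / load_mmlu_results_op; not code from this repository.
from typing import List, NamedTuple

from kfp import dsl
from kfp.dsl import Artifact, Input, Output


@dsl.component
def eval_op(
    results: Output[Artifact], models_list: List[str]
) -> NamedTuple("outputs", best_model=str, best_score=float):
    import json
    from typing import NamedTuple

    scores = {model: 0.5 for model in models_list}  # placeholder scoring
    with open(results.path, "w") as f:
        json.dump(scores, f, indent=4)
    outputs = NamedTuple("outputs", best_model=str, best_score=float)
    best = max(scores, key=scores.get)
    return outputs(best_model=best, best_score=scores[best])


@dsl.component
def load_results_op(results: Input[Artifact]) -> list:
    import json

    with open(results.path, "r") as f:
        return list(json.load(f).items())


@dsl.pipeline(name="eval-sketch")
def eval_pipeline(models_list: List[str]):
    eval_task = eval_op(models_list=models_list)
    # The artifact written by the first component feeds the second; the
    # NamedTuple fields surface as named task outputs for downstream steps.
    load_results_op(results=eval_task.outputs["results"])
    best_model = eval_task.outputs["best_model"]  # available to later tasks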
3 changes: 1 addition & 2 deletions eval/mt_bench/__init__.py
@@ -1,5 +1,4 @@
from .components import run_mt_bench_op, load_mt_bench_results_op
#from . import faked
# from . import faked

__all__ = ["run_mt_bench_op", "load_mt_bench_results_op"]

94 changes: 60 additions & 34 deletions eval/mt_bench/components.py
@@ -6,6 +6,7 @@

EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"


@component(base_image=EVAL_IMAGE, packages_to_install=["vllm"])
def run_mt_bench_op(
models_path_prefix: str,
@@ -17,10 +18,10 @@ def run_mt_bench_op(
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str = "auto",
device: str = None,
) -> NamedTuple('outputs', best_model=str, best_score=float):


def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5):
) -> NamedTuple("outputs", best_model=str, best_score=float):
def launch_vllm_server_background(
model_path: str, gpu_count: int, retries: int = 60, delay: int = 5
):
import subprocess
import sys
import time
@@ -29,15 +30,20 @@ def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int
if gpu_count > 0:
command = [
sys.executable,
"-m", "vllm.entrypoints.openai.api_server",
"--model", model_path,
"--tensor-parallel-size", str(gpu_count),
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
"--tensor-parallel-size",
str(gpu_count),
]
else:
command = [
sys.executable,
"-m", "vllm.entrypoints.openai.api_server",
"--model", model_path,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
]

subprocess.Popen(args=command)
@@ -54,10 +60,14 @@ def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int
except requests.ConnectionError:
pass

print(f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...")
print(
f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
)
time.sleep(delay)

raise RuntimeError(f"Failed to start vLLM server at {server_url} after {retries} retries.")
raise RuntimeError(
f"Failed to start vLLM server at {server_url} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include `pkill` cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
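
Picking up the launch helper above: the retry loop in launch_vllm_server_background simply polls the server URL until an HTTP response comes back, sleeping between attempts and giving up after a fixed number of retries. A stripped-down version of that pattern is sketched below; the endpoint URL and timing values are placeholders, since the component's actual probe target sits in the collapsed lines of the hunk.

# Minimal readiness-poll sketch; URL and timings are placeholders, not the
# component's actual values.
import time

import requests


def wait_for_server(
    server_url: str = "http://localhost:8000/v1/models",
    retries: int = 60,
    delay: int = 5,
) -> None:
    for attempt in range(retries):
        try:
            if requests.get(server_url, timeout=5).status_code == 200:
                print(f"Server available after {attempt + 1} attempt(s).")
                return
        except requests.ConnectionError:
            pass  # server process not accepting connections yet
        print(
            f"Server not available yet, retrying in {delay} seconds "
            f"(Attempt {attempt + 1}/{retries})..."
        )
        time.sleep(delay)
    raise RuntimeError(f"Failed to reach {server_url} after {retries} retries.")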
@@ -67,21 +77,30 @@ def stop_vllm_server_by_name():
for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(f"Found vLLM server process with PID: {process.info['pid']}, terminating...")
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(f"Forcefully killing vLLM server process with PID: {process.info['pid']}")
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(f"Successfully stopped vLLM server with PID: {process.info['pid']}")
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(f"Access denied when trying to terminate process with PID {process.info['pid']}.")
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(f"Failed to terminate process with PID {process.info['pid']}. Error: {e}")

print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)

import json
import torch
@@ -93,7 +112,11 @@ def stop_vllm_server_by_name():
candidate_server_url = "http://localhost:8000/v1"

gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if gpu_available else "No GPU available"
gpu_name = (
torch.cuda.get_device_name(torch.cuda.current_device())
if gpu_available
else "No GPU available"
)
gpu_count = torch.cuda.device_count() if gpu_available else 0

print(f"GPU Available: {gpu_available}, {gpu_name}")
@@ -108,12 +131,12 @@ def stop_vllm_server_by_name():

# TODO: Using evaluator results in connection errors, need to determine why.
# For now, using mt_bench_answers.generate_answers & mt_bench_judgment.generate_judgment
#evaluator = MTBenchEvaluator(
# evaluator = MTBenchEvaluator(
# model_name=candidate_model_name,
# judge_model_name=judge_model_name,
# max_workers=max_workers,
# merge_system_user_message=merge_system_user_message
#)
# )

judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
@@ -125,7 +148,7 @@ def stop_vllm_server_by_name():
for model_name in models_list:
print(f"Serving candidate model: {model_name}")
model_path = f"{models_path_prefix}/{model_name}"

# Launch the vLLM server and wait until it is ready
launch_vllm_server_background(model_path, gpu_count)

@@ -135,18 +158,20 @@ def stop_vllm_server_by_name():
model_name=model_path,
model_api_base=candidate_server_url,
output_dir="/tmp/eval_output",
max_workers=max_workers
max_workers=max_workers,
)

print("Judging answers...")
overall_score, qa_pairs, turn_scores, error_rate = mt_bench_judgment.generate_judgment(
model_name=model_path,
judge_model_name=judge_model_name,
model_api_base=judge_endpoint,
api_key=judge_api_key,
output_dir="/tmp/eval_output",
max_workers=max_workers,
merge_system_user_message=merge_system_user_message
overall_score, qa_pairs, turn_scores, error_rate = (
mt_bench_judgment.generate_judgment(
model_name=model_path,
judge_model_name=judge_model_name,
model_api_base=judge_endpoint,
api_key=judge_api_key,
output_dir="/tmp/eval_output",
max_workers=max_workers,
merge_system_user_message=merge_system_user_message,
)
)

stop_vllm_server_by_name()
@@ -164,21 +189,22 @@ def stop_vllm_server_by_name():
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score

with open(mt_bench_output.path, 'w') as f:
with open(mt_bench_output.path, "w") as f:
json.dump(all_mt_bench_data, f, indent=4)

outputs = NamedTuple('outputs', best_model=str, best_score=float)
outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
return outputs(best_model=best_model, best_score=best_score)


@component(base_image=PYTHON_IMAGE)
def load_mt_bench_results_op(mt_bench_output: Input[Artifact]) -> list:
import json

mt_bench_score_list = []
with open(mt_bench_output.path, 'r') as f:
mt_bench_score_list = json.load(f)
with open(mt_bench_output.path, "r") as f:
mt_bench_score_list = json.load(f)

print("MT_Bench Evaluation Data:")
for mt_bench_score in mt_bench_score_list:
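
Since the whole PR is formatting, the result can be verified locally, and protected against regressions, with the formatter's check mode. The snippet below assumes Black produced this diff, which the PR suggests but never states, so treat it as a hypothetical follow-up rather than part of this change.

# Hypothetical verification step; assumes the `black` package. Exits non-zero
# if any file under the repository root would still be reformatted.
import subprocess
import sys

result = subprocess.run(["black", "--check", "--diff", "."])
sys.exit(result.returncode)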