Format entire codebase #57

Merged
merged 1 commit on Oct 3, 2024
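
Every hunk below is a pure formatting change: string quotes are normalized to double quotes, long calls are wrapped, and blank lines are added around decorated functions. That is consistent with a repo-wide pass of an opinionated formatter such as Black, although the PR does not name the tool or its configuration. A minimal, hypothetical reproduction of such a pass is sketched below; the `black` invocation is an assumption, not something stated in this PR.

# Hypothetical sketch of a repo-wide formatting pass; assumes the `black`
# package is installed and that Black (or a Black-compatible formatter)
# produced this diff, which the PR itself does not state.
import subprocess

# Rewrite every Python file under the repository root in place.
subprocess.run(["black", "."], check=True)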
3 changes: 1 addition & 2 deletions eval/mmlu/__init__.py
@@ -1,5 +1,4 @@
from .components import run_mmlu_op, load_mmlu_results_op
#from . import faked
# from . import faked

__all__ = ["run_mmlu_op", "load_mmlu_results_op"]

33 changes: 23 additions & 10 deletions eval/mmlu/components.py
@@ -6,6 +6,7 @@

EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"


@component(base_image=EVAL_IMAGE)
def run_mmlu_op(
mmlu_output: Output[Artifact],
@@ -16,21 +17,27 @@ def run_mmlu_op(
batch_size: int,
device: str,
models_list: List[str],
) -> NamedTuple('outputs', best_model=str, best_score=float):
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
import torch
from instructlab.eval.mmlu import MMLUEvaluator, MMLU_TASKS

mmlu_tasks = mmlu_tasks_list.split(',') if mmlu_tasks_list else MMLU_TASKS
mmlu_tasks = mmlu_tasks_list.split(",") if mmlu_tasks_list else MMLU_TASKS

# Device setup and debug
gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if gpu_available else "No GPU available"
gpu_name = (
torch.cuda.get_device_name(torch.cuda.current_device())
if gpu_available
else "No GPU available"
)

print(f"GPU Available: {gpu_available}, Using: {gpu_name}")

effective_device = device if device is not None else ("cuda" if gpu_available else "cpu")
effective_device = (
device if device is not None else ("cuda" if gpu_available else "cpu")
)
print(f"Running on device: {effective_device}")

scores = {}
@@ -53,33 +60,39 @@ def run_mmlu_op(

mmlu_score, individual_scores = evaluator.run()
average_score = round(mmlu_score, 2)
print(f"Model {model_name} is stored at: {model_path} with AVERAGE_SCORE: {average_score}")
print(
f"Model {model_name} is stored at: {model_path} with AVERAGE_SCORE: {average_score}"
)

mmlu_data = {
"report_title": "KNOWLEDGE EVALUATION REPORT",
"model": model_name,
"average_score": average_score,
"number_of_tasks": len(individual_scores),
"individual_scores": [{task: round(score['score'], 2)} for task, score in individual_scores.items()]
"individual_scores": [
{task: round(score["score"], 2)}
for task, score in individual_scores.items()
],
}

all_mmlu_data.append(mmlu_data)
scores[model_path] = average_score

with open(mmlu_output.path, 'w') as f:
with open(mmlu_output.path, "w") as f:
json.dump(all_mmlu_data, f, indent=4)
outputs = NamedTuple('outputs', best_model=str, best_score=float)
outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
return outputs(best_model=best_model, best_score=best_score)


@component(base_image=PYTHON_IMAGE)
def load_mmlu_results_op(mmlu_output: Input[Artifact]) -> list:
import json

mmlu_score_list = []
with open(mmlu_output.path, 'r') as f:
mmlu_score_list = json.load(f)
with open(mmlu_output.path, "r") as f:
mmlu_score_list = json.load(f)

print("MMLU Evaluation Data:")
for mmlu_score in mmlu_score_list:
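
The hunks above are truncated in the web view, but the two components they touch follow the usual KFP dataflow: run_mmlu_op writes a JSON report to an Output[Artifact] and returns a NamedTuple with the best model and score, and load_mmlu_results_op reads that artifact back. The sketch below illustrates that handoff with self-contained stand-in components, since parts of the real signatures are collapsed in the diff; every name in it (eval_op, load_results_op, eval-sketch) is hypothetical, and it assumes only kfp v2.

# Hypothetical stand-ins illustrating the Output[Artifact] + NamedTuple pattern
# used by run_mmlu_op / load_mmlu_results_op; not code from this repository.
from typing import List, NamedTuple

from kfp import dsl
from kfp.dsl import Artifact, Input, Output


@dsl.component
def eval_op(
    results: Output[Artifact], models_list: List[str]
) -> NamedTuple("outputs", best_model=str, best_score=float):
    import json
    from typing import NamedTuple

    scores = {model: 0.5 for model in models_list}  # placeholder scoring
    with open(results.path, "w") as f:
        json.dump(scores, f, indent=4)
    outputs = NamedTuple("outputs", best_model=str, best_score=float)
    best = max(scores, key=scores.get)
    return outputs(best_model=best, best_score=scores[best])


@dsl.component
def load_results_op(results: Input[Artifact]) -> list:
    import json

    with open(results.path, "r") as f:
        return list(json.load(f).items())


@dsl.pipeline(name="eval-sketch")
def eval_pipeline(models_list: List[str]):
    eval_task = eval_op(models_list=models_list)
    # The artifact written by the first component feeds the second; the
    # NamedTuple fields surface as named task outputs for downstream steps.
    load_results_op(results=eval_task.outputs["results"])
    best_model = eval_task.outputs["best_model"]  # available to later tasks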
3 changes: 1 addition & 2 deletions eval/mt_bench/__init__.py
@@ -1,5 +1,4 @@
from .components import run_mt_bench_op, load_mt_bench_results_op
#from . import faked
# from . import faked

__all__ = ["run_mt_bench_op", "load_mt_bench_results_op"]

94 changes: 60 additions & 34 deletions eval/mt_bench/components.py
@@ -6,6 +6,7 @@

EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval"


@component(base_image=EVAL_IMAGE, packages_to_install=["vllm"])
def run_mt_bench_op(
models_path_prefix: str,
@@ -17,10 +18,10 @@ def run_mt_bench_op(
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str = "auto",
device: str = None,
) -> NamedTuple('outputs', best_model=str, best_score=float):


def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5):
) -> NamedTuple("outputs", best_model=str, best_score=float):
def launch_vllm_server_background(
model_path: str, gpu_count: int, retries: int = 60, delay: int = 5
):
import subprocess
import sys
import time
@@ -29,15 +30,20 @@ def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int
if gpu_count > 0:
command = [
sys.executable,
"-m", "vllm.entrypoints.openai.api_server",
"--model", model_path,
"--tensor-parallel-size", str(gpu_count),
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
"--tensor-parallel-size",
str(gpu_count),
]
else:
command = [
sys.executable,
"-m", "vllm.entrypoints.openai.api_server",
"--model", model_path,
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
model_path,
]

subprocess.Popen(args=command)
@@ -54,10 +60,14 @@ def launch_vllm_server_background(model_path: str, gpu_count: int, retries: int
except requests.ConnectionError:
pass

print(f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...")
print(
f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
)
time.sleep(delay)

raise RuntimeError(f"Failed to start vLLM server at {server_url} after {retries} retries.")
raise RuntimeError(
f"Failed to start vLLM server at {server_url} after {retries} retries."
)

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include `pkill` cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
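
Picking up the launch helper above: the retry loop in launch_vllm_server_background simply polls the server URL until an HTTP response comes back, sleeping between attempts and giving up after a fixed number of retries. A stripped-down version of that pattern is sketched below; the endpoint URL and timing values are placeholders, since the component's actual probe target sits in the collapsed lines of the hunk.

# Minimal readiness-poll sketch; URL and timings are placeholders, not the
# component's actual values.
import time

import requests


def wait_for_server(
    server_url: str = "http://localhost:8000/v1/models",
    retries: int = 60,
    delay: int = 5,
) -> None:
    for attempt in range(retries):
        try:
            if requests.get(server_url, timeout=5).status_code == 200:
                print(f"Server available after {attempt + 1} attempt(s).")
                return
        except requests.ConnectionError:
            pass  # server process not accepting connections yet
        print(
            f"Server not available yet, retrying in {delay} seconds "
            f"(Attempt {attempt + 1}/{retries})..."
        )
        time.sleep(delay)
    raise RuntimeError(f"Failed to reach {server_url} after {retries} retries.")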
@@ -67,21 +77,30 @@ def stop_vllm_server_by_name():
for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
cmdline = process.info.get("cmdline")
if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
print(f"Found vLLM server process with PID: {process.info['pid']}, terminating...")
print(
f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
)
try:
process.terminate() # Try graceful termination
process.wait(timeout=5) # Wait a bit for it to terminate
if process.is_running():
print(f"Forcefully killing vLLM server process with PID: {process.info['pid']}")
print(
f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
)
process.kill() # Force kill if it's still running
print(f"Successfully stopped vLLM server with PID: {process.info['pid']}")
print(
f"Successfully stopped vLLM server with PID: {process.info['pid']}"
)
except psutil.NoSuchProcess:
print(f"Process with PID {process.info['pid']} no longer exists.")
except psutil.AccessDenied:
print(f"Access denied when trying to terminate process with PID {process.info['pid']}.")
print(
f"Access denied when trying to terminate process with PID {process.info['pid']}."
)
except Exception as e:
print(f"Failed to terminate process with PID {process.info['pid']}. Error: {e}")

print(
f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
)

import json
import torch
@@ -93,7 +112,11 @@ def stop_vllm_server_by_name():
candidate_server_url = "http://localhost:8000/v1"

gpu_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(torch.cuda.current_device()) if gpu_available else "No GPU available"
gpu_name = (
torch.cuda.get_device_name(torch.cuda.current_device())
if gpu_available
else "No GPU available"
)
gpu_count = torch.cuda.device_count() if gpu_available else 0

print(f"GPU Available: {gpu_available}, {gpu_name}")
@@ -108,12 +131,12 @@ def stop_vllm_server_by_name():

# TODO: Using evaluator results in connection errors, need to determine why.
# For now, using mt_bench_answers.generate_answers & mt_bench_judgment.generate_judgment
#evaluator = MTBenchEvaluator(
# evaluator = MTBenchEvaluator(
# model_name=candidate_model_name,
# judge_model_name=judge_model_name,
# max_workers=max_workers,
# merge_system_user_message=merge_system_user_message
#)
# )

judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
@@ -125,7 +148,7 @@ def stop_vllm_server_by_name():
for model_name in models_list:
print(f"Serving candidate model: {model_name}")
model_path = f"{models_path_prefix}/{model_name}"

# Launch the vLLM server and wait until it is ready
launch_vllm_server_background(model_path, gpu_count)

@@ -135,18 +158,20 @@ def stop_vllm_server_by_name():
model_name=model_path,
model_api_base=candidate_server_url,
output_dir="/tmp/eval_output",
max_workers=max_workers
max_workers=max_workers,
)

print("Judging answers...")
overall_score, qa_pairs, turn_scores, error_rate = mt_bench_judgment.generate_judgment(
model_name=model_path,
judge_model_name=judge_model_name,
model_api_base=judge_endpoint,
api_key=judge_api_key,
output_dir="/tmp/eval_output",
max_workers=max_workers,
merge_system_user_message=merge_system_user_message
overall_score, qa_pairs, turn_scores, error_rate = (
mt_bench_judgment.generate_judgment(
model_name=model_path,
judge_model_name=judge_model_name,
model_api_base=judge_endpoint,
api_key=judge_api_key,
output_dir="/tmp/eval_output",
max_workers=max_workers,
merge_system_user_message=merge_system_user_message,
)
)

stop_vllm_server_by_name()
@@ -164,21 +189,22 @@ def stop_vllm_server_by_name():
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score

with open(mt_bench_output.path, 'w') as f:
with open(mt_bench_output.path, "w") as f:
json.dump(all_mt_bench_data, f, indent=4)

outputs = NamedTuple('outputs', best_model=str, best_score=float)
outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
return outputs(best_model=best_model, best_score=best_score)


@component(base_image=PYTHON_IMAGE)
def load_mt_bench_results_op(mt_bench_output: Input[Artifact]) -> list:
import json

mt_bench_score_list = []
with open(mt_bench_output.path, 'r') as f:
mt_bench_score_list = json.load(f)
with open(mt_bench_output.path, "r") as f:
mt_bench_score_list = json.load(f)

print("MT_Bench Evaluation Data:")
for mt_bench_score in mt_bench_score_list:
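
Since the whole PR is formatting, the result can be verified locally, and protected against regressions, with the formatter's check mode. The snippet below assumes Black produced this diff, which the PR suggests but never states, so treat it as a hypothetical follow-up rather than part of this change.

# Hypothetical verification step; assumes the `black` package. Exits non-zero
# if any file under the repository root would still be reformatted.
import subprocess
import sys

result = subprocess.run(["black", "--check", "--diff", "."])
sys.exit(result.returncode)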