diff --git a/eval/Containerfile b/eval/Containerfile index d00f55c..9e0e12a 100644 --- a/eval/Containerfile +++ b/eval/Containerfile @@ -4,9 +4,13 @@ RUN dnf install -y python3.11 python3.11-devel git python3-pip make automake gcc && dnf clean all \ && rm -rf /var/cache/*dnf* -# TODO update to instructlab/eval after https://github.com/instructlab/eval/pull/128 or equivalent merges +# TODO update to main/ilab-on-ocp utils/helpers +# helpers package includes start_vllm, stop_vllm RUN python3.11 -m ensurepip \ - && python3.11 -m pip install --no-cache-dir git+https://github.com/instructlab/eval@main \ + && python3.11 -m pip install --no-cache-dir git+https://github.com/instructlab/eval@v0.3.1 \ + && python3.11 -m pip install --no-cache-dir git+https://github.com/instructlab/instructlab@v0.19.2 \ + && python3.11 -m pip install --no-cache-dir git+https://github.com/sallyom/ilab-on-ocp.git@final-eval-mtbench#subdirectory=utils/helpers \ + && python3.11 -m pip install --no-cache-dir tenacity lm-eval[api] \ && rm -rf /usr/bin/python3 && ln -s /usr/bin/python3.11 /usr/bin/python3 ENV HF_HOME=/tmp diff --git a/eval/final/__init__.py b/eval/final/__init__.py new file mode 100644 index 0000000..9852157 --- /dev/null +++ b/eval/final/__init__.py @@ -0,0 +1,5 @@ +from .components import run_mt_bench_branch_op + +# from . import faked + +__all__ = ["run_mt_bench_branch_op"] diff --git a/eval/final/components.py b/eval/final/components.py new file mode 100644 index 0000000..b63e8b4 --- /dev/null +++ b/eval/final/components.py @@ -0,0 +1,243 @@ +# type: ignore +# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error +from typing import List, NamedTuple + +from kfp.dsl import Artifact, Dataset, Input, Model, Output, component, importer + +from utils.consts import EVAL_IMAGE, PYTHON_IMAGE + + +# TODO: update to ilab image, already has vLLM installed +@component( + base_image=EVAL_IMAGE, + packages_to_install=[ + "vllm", + ], +) +def run_mt_bench_branch_op( + mt_bench_branch_output: Output[Artifact], + candidate_model: str, + base_model_dir: str, + taxonomy: Input[Dataset], + base_branch: str, + candidate_branch: str, + max_workers: str, + device: str, + merge_system_user_message: bool, +): + import json + import os + + import torch + from helpers import ( + VLLM_SERVER, + launch_vllm, + stop_vllm, + ) + from instructlab.eval.mt_bench import MTBenchBranchEvaluator + from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score + + ###################################################################### + # branch_eval_summary_to_json creates a json object from output of instructlab/eval + # TODO: Add this to the instructlab/eval or instructlab/instructlab repository + def branch_eval_summary_to_json( + improvements: list[tuple[str, float, float, float]], + regressions: list[tuple[str, float, float, float]], + no_changes: list[tuple[str, float]], + new=None, + ) -> str: + """Generates a JSON object from the _branch benchmark evaluations""" + + import json + + summary = {"improvements": [], "regressions": [], "no_changes": [], "new": []} + + if len(improvements) > 0: + improvements.sort(key=sort_score, reverse=True) + for improvement in improvements: + task, delta, base_score, new_score = improvement + summary["improvements"].append( + { + "task": task, + "base_score": round(base_score, 2), + "new_score": round(new_score, 2), + "delta": delta, + } + ) + + if len(regressions) > 0: + regressions.sort(key=sort_score) + for regression in regressions: + task, delta, base_score, 
+                new_score = regression
+                summary["regressions"].append(
+                    {
+                        "task": task,
+                        "base_score": round(base_score, 2),
+                        "new_score": round(new_score, 2),
+                        "delta": delta,
+                    }
+                )
+
+        if len(no_changes) > 0:
+            for entry in no_changes:
+                task, avg_score = entry
+                summary["no_changes"].append(
+                    {"task": task, "average_score": round(avg_score, 2)}
+                )
+
+        if new is not None and len(new) > 0:
+            for entry in new:
+                qna, avg_score = entry
+                summary["new"].append(
+                    {"qna": qna, "average_score": round(avg_score, 2)}
+                )
+
+        return json.dumps(summary, indent=4)
+
+    ######################################################################
+
+    gpu_available = torch.cuda.is_available()
+    gpu_name = (
+        torch.cuda.get_device_name(torch.cuda.current_device())
+        if gpu_available
+        else "No GPU available"
+    )
+    gpu_count = torch.cuda.device_count() if gpu_available else 0
+
+    print(f"GPU Available: {gpu_available}, Using: {gpu_name}")
+
+    # MT_BENCH_BRANCH
+
+    judge_api_key = os.getenv("JUDGE_API_KEY", "")
+    judge_model_name = os.getenv("JUDGE_NAME")
+    judge_endpoint = os.getenv("JUDGE_ENDPOINT")
+
+    output_dir = "/tmp/eval_output"
+
+    # TODO: candidate_branch must be in the same repo, not a fork; or, compare the main branch against the candidate and base models
+    base_branch = base_branch or "main"
+    candidate_branch = candidate_branch or "main"
+
+    ######################################################################
+    # TODO: Update the ilab/model/evaluate `evaluate` logic to allow for an external judge model;
+    # when that happens, much of this logic can be imported from the `evaluate` definition:
+    # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
+    #
+    # With instructlab, model_name is synonymous with model_path
+    mt_bench_evaluators = [
+        MTBenchBranchEvaluator(
+            model_name=candidate_model,
+            judge_model_name=judge_model_name,
+            taxonomy_git_repo_path=taxonomy.path,
+            branch=candidate_branch,
+            output_dir=output_dir,
+            merge_system_user_message=merge_system_user_message,
+        ),
+        MTBenchBranchEvaluator(
+            model_name=base_model_dir,
+            judge_model_name=judge_model_name,
+            taxonomy_git_repo_path=taxonomy.path,
+            branch=base_branch,
+            output_dir=output_dir,
+            merge_system_user_message=merge_system_user_message,
+        ),
+    ]
+
+    # ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
+    # with `auto`, the number of workers is calculated based on the environment
+    # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
+    if max_workers == "auto":
+        try:
+            usable_cpu_count = len(os.sched_getaffinity(0)) // 2
+        except AttributeError:
+            import multiprocessing
+
+            usable_cpu_count = multiprocessing.cpu_count() // 2
+        max_workers = usable_cpu_count
+
+    branches = [candidate_branch, base_branch]
+    m_paths = [candidate_model, base_model_dir]
+    qa_pairs_and_errors = []
+    for i, evaluator in enumerate(mt_bench_evaluators):
+        branch = branches[i]
+        m_path = m_paths[i]
+
+        print(
+            f"Generating questions and reference answers from qna files for branch {branch}..."
+ ) + launch_vllm(m_path, gpu_count) + + evaluator.gen_answers( + server_url=VLLM_SERVER, + serving_gpus=gpu_count, + max_workers=max_workers, + ) + + stop_vllm() + + print(f"Evaluating answers for branch {branch}...") + overall_score, qa_pairs, error_rate = evaluator.judge_answers( + server_url=judge_endpoint, + api_key=judge_api_key, + serving_gpus=gpu_count, + max_workers=max_workers, + ) + + qa_pairs_and_errors.append((overall_score, qa_pairs, error_rate)) + + overall_score, qa_pairs, error_rate = qa_pairs_and_errors[0] + base_overall_score, base_qa_pairs, base_error_rate = qa_pairs_and_errors[1] + + qna_to_avg_scores = qa_pairs_to_qna_to_avg_scores(qa_pairs) + base_qna_to_avg_scores = qa_pairs_to_qna_to_avg_scores(base_qa_pairs) + + improvements, regressions, no_changes, new_qnas = [], [], [], [] + + for qna, avg_score in qna_to_avg_scores.items(): + base_avg_score = base_qna_to_avg_scores.get(qna) + if base_avg_score is not None: + if avg_score > base_avg_score: + improvements.append( + ( + qna, + round(avg_score - base_avg_score, 2), + base_avg_score, + avg_score, + ) + ) + elif avg_score == base_avg_score: + no_changes.append((qna, avg_score)) + else: + regressions.append( + ( + qna, + round(avg_score - base_avg_score, 2), + base_avg_score, + avg_score, + ) + ) + else: + new_qnas.append((qna, avg_score)) + + error_rate = (error_rate + base_error_rate) / 2 + if error_rate > 0: + error_rate = round(error_rate, 2) + + summary = branch_eval_summary_to_json( + improvements, + regressions, + no_changes, + new_qnas, + ) + + mt_bench_branch_data = { + "report_title": "SKILLS EVALUATION REPORT", + "model": candidate_model, + "judge_model": judge_model_name, + "max_score": "10.0", + "overall_score": overall_score, + "base_overall_score": base_overall_score, + "error_rate": error_rate, + "summary": summary, + } + + with open(mt_bench_branch_output.path, "w") as f: + json.dump(mt_bench_branch_data, f, indent=4) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index b0487ad..429f4b2 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -4,10 +4,7 @@ from kfp.dsl import Artifact, Input, Model, Output, component, importer -from utils.consts import PYTHON_IMAGE - -# TODO: replace with ilab image -EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval-7ee213" +from utils.consts import EVAL_IMAGE, PYTHON_IMAGE @component(base_image=EVAL_IMAGE, packages_to_install=["vllm"]) @@ -23,96 +20,18 @@ def run_mt_bench_op( models_folder: Optional[str] = None, device: str = None, ) -> NamedTuple("outputs", best_model=str, best_score=float): - def launch_vllm(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5): - import subprocess - import sys - import time - - import requests - - if gpu_count > 0: - command = [ - sys.executable, - "-m", - "vllm.entrypoints.openai.api_server", - "--model", - model_path, - "--tensor-parallel-size", - str(gpu_count), - ] - else: - command = [ - sys.executable, - "-m", - "vllm.entrypoints.openai.api_server", - "--model", - model_path, - ] - - subprocess.Popen(args=command) - - server_url = "http://localhost:8000/v1" - print(f"Waiting for vLLM server to start at {server_url}...") - - for attempt in range(retries): - try: - response = requests.get(f"{server_url}/models") - if response.status_code == 200: - print(f"vLLM server is up and running at {server_url}.") - return - except requests.ConnectionError: - pass - - print( - f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..." 
- ) - time.sleep(delay) - - raise RuntimeError( - f"Failed to start vLLM server at {server_url} after {retries} retries." - ) - - # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work - # Also, the base image does not include `pkill` cmd, so can't pkill -f vllm.entrypoints.openai.api_server either - def stop_vllm_server_by_name(): - import psutil - - for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]): - cmdline = process.info.get("cmdline") - if cmdline and "vllm.entrypoints.openai.api_server" in cmdline: - print( - f"Found vLLM server process with PID: {process.info['pid']}, terminating..." - ) - try: - process.terminate() # Try graceful termination - process.wait(timeout=5) # Wait a bit for it to terminate - if process.is_running(): - print( - f"Forcefully killing vLLM server process with PID: {process.info['pid']}" - ) - process.kill() # Force kill if it's still running - print( - f"Successfully stopped vLLM server with PID: {process.info['pid']}" - ) - except psutil.NoSuchProcess: - print(f"Process with PID {process.info['pid']} no longer exists.") - except psutil.AccessDenied: - print( - f"Access denied when trying to terminate process with PID {process.info['pid']}." - ) - except Exception as e: - print( - f"Failed to terminate process with PID {process.info['pid']}. Error: {e}" - ) - import json import os import torch + from helpers import ( + VLLM_SERVER, + launch_vllm, + stop_vllm, + ) from instructlab.eval.mt_bench import MTBenchEvaluator os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - vllm_server = "http://localhost:8000/v1" gpu_available = torch.cuda.is_available() gpu_name = ( @@ -159,12 +78,12 @@ def stop_vllm_server_by_name(): ) evaluator.gen_answers( - server_url=vllm_server, + server_url=VLLM_SERVER, serving_gpus=gpu_count, max_workers=max_workers, ) - stop_vllm_server_by_name() + stop_vllm() overall_score, qa_pairs, turn_scores, error_rate = evaluator.judge_answers( server_url=judge_endpoint, diff --git a/pipeline.py b/pipeline.py index e3650e4..bf94099 100644 --- a/pipeline.py +++ b/pipeline.py @@ -24,6 +24,7 @@ DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" KFP_MODEL_SERVER_CM = "sdg/kfp-model-server.yaml" BASE_MODE = "ibm-granite/granite-7b-base" +BASE_MODEL_DIR = "/model/model" # <- "model ID for vLLM chat/completions - corresponds to path within pvc" MMLU_TASKS_LIST = "mmlu_anatomy,mmlu_astronomy" MODEL_DTYPE = "bfloat16" FEW_SHOTS = 5 @@ -63,8 +64,8 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): pvc_to_model_op, ) - # Imports for MMLU, MT_BENCH stage - # TODO: Add mock/fake components + # Imports for evaluation + from eval.final import run_mt_bench_branch_op from eval.mmlu import load_mmlu_results_op, run_mmlu_op ## from eval.mmlu import run_mmlu_op, load_mmlu_results_op @@ -312,6 +313,37 @@ def pipeline( use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + final_eval_task = run_mt_bench_branch_op( + candidate_model=run_mt_bench_task.outputs["best_model"], + taxonomy=git_clone_task.outputs["taxonomy"], + # TODO: DO we need both candidate_branch and base_branch + base_branch=repo_branch, + candidate_branch=repo_branch, + device=device, + base_model_dir=BASE_MODEL_DIR, + max_workers=max_workers, + merge_system_user_message=merge_system_user_message, + ) + + mount_pvc( + task=final_eval_task, pvc_name=output_pvc_task.output, mount_path="/output" + ) + + mount_pvc( + task=final_eval_task, 
pvc_name=model_pvc_task.output, mount_path="/model" + ) + + use_config_map_as_env( + final_eval_task, + JUDGE_CONFIG_MAP, + dict(endpoint="JUDGE_ENDPOINT", model="JUDGE_NAME"), + ) + + use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + + final_eval_task.set_accelerator_type("nvidia.com/gpu") + final_eval_task.set_accelerator_limit(1) + # Technically `output_model_task` and `output_data_task` can happen before evaluation, # however the PVC can only be mounted once, so, setting these to _after_ so the eval proceeds. output_model_task = pvc_to_artifact_op( diff --git a/pipeline.yaml b/pipeline.yaml index 816e3fa..333a824 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -423,6 +423,35 @@ components: parameterType: STRING name: parameterType: STRING + comp-run-mt-bench-branch-op: + executorLabel: exec-run-mt-bench-branch-op + inputDefinitions: + artifacts: + taxonomy: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + parameters: + base_branch: + parameterType: STRING + base_model_dir: + parameterType: STRING + candidate_branch: + parameterType: STRING + candidate_model: + parameterType: STRING + device: + parameterType: STRING + max_workers: + parameterType: STRING + merge_system_user_message: + parameterType: BOOLEAN + outputDefinitions: + artifacts: + mt_bench_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 comp-run-mt-bench-op: executorLabel: exec-run-mt-bench-op inputDefinitions: @@ -905,20 +934,21 @@ deploymentSpec: \ claimName: {output_pvc_name}\n \"\"\"\n\ \ )\n\n return Outputs(manifest, name)\n\n" image: registry.access.redhat.com/ubi9/python-311:latest - exec-run-mmlu-op: + exec-run-mt-bench-branch-op: container: args: - --executor_input - '{{$}}' - --function_to_execute - - run_mmlu_op + - run_mt_bench_branch_op command: - sh - -c - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\ - \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' &&\ + \ python3 -m pip install --quiet --no-warn-script-location 'vllm' && \"\ $0\" \"$@\"\n" - sh - -ec @@ -931,43 +961,115 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef run_mmlu_op(\n mmlu_output: Output[Artifact],\n models_path_prefix:\ - \ str,\n mmlu_tasks_list: str,\n model_dtype: str,\n few_shots:\ - \ int,\n batch_size: int,\n device: str = None,\n models_list:\ - \ List[str] = None,\n models_folder: Optional[str] = None,\n) -> NamedTuple(\"\ - outputs\", best_model=str, best_score=float):\n import json\n import\ - \ os\n\n import torch\n from instructlab.eval.mmlu import MMLU_TASKS,\ - \ MMLUEvaluator\n\n mmlu_tasks = mmlu_tasks_list.split(\",\") if mmlu_tasks_list\ - \ else MMLU_TASKS\n\n if models_list is None and models_folder:\n \ - \ models_list = os.listdir(models_folder)\n\n # Device setup and\ - \ debug\n gpu_available = torch.cuda.is_available()\n gpu_name = (\n\ - \ torch.cuda.get_device_name(torch.cuda.current_device())\n \ - \ if gpu_available\n else \"No GPU available\"\n )\n\n print(f\"\ - GPU Available: {gpu_available}, Using: {gpu_name}\")\n\n effective_device\ - \ = (\n device if device is not None else (\"cuda\" if gpu_available\ - \ else \"cpu\")\n )\n print(f\"Running on device: {effective_device}\"\ - )\n\n scores = {}\n all_mmlu_data = []\n\n for model_name in models_list:\n\ - \ model_path = f\"{models_path_prefix}/{model_name}\"\n #\ - \ Debug\n print(f\"Model {model_name} is stored at: {model_path}\"\ - )\n\n # Evaluation\n evaluator = MMLUEvaluator(\n \ - \ model_path=model_path,\n tasks=mmlu_tasks,\n \ - \ model_dtype=model_dtype,\n few_shots=few_shots,\n \ - \ batch_size=batch_size,\n device=effective_device,\n \ - \ )\n\n mmlu_score, individual_scores = evaluator.run()\n \ - \ average_score = round(mmlu_score, 2)\n print(\n f\"\ - Model {model_name} is stored at: {model_path} with AVERAGE_SCORE: {average_score}\"\ - \n )\n\n mmlu_data = {\n \"report_title\": \"KNOWLEDGE\ - \ EVALUATION REPORT\",\n \"model\": model_name,\n \ - \ \"average_score\": average_score,\n \"number_of_tasks\": len(individual_scores),\n\ - \ \"individual_scores\": [\n {task: round(score[\"\ - score\"], 2)}\n for task, score in individual_scores.items()\n\ - \ ],\n }\n\n all_mmlu_data.append(mmlu_data)\n\ - \ scores[model_path] = average_score\n\n with open(mmlu_output.path,\ - \ \"w\") as f:\n json.dump(all_mmlu_data, f, indent=4)\n outputs\ - \ = NamedTuple(\"outputs\", best_model=str, best_score=float)\n best_model\ - \ = max(scores, key=scores.get)\n best_score = scores[best_model]\n \ - \ return outputs(best_model=best_model, best_score=best_score)\n\n" - image: quay.io/sallyom/instructlab-ocp:eval + \ *\n\ndef run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n\ + \ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n\ + \ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\ + \ device: str,\n merge_system_user_message: bool,\n):\n import\ + \ json\n import os\n\n import torch\n from helpers import (\n \ + \ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n\ + \ from 
instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\ + \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\ + \n ######################################################################\n\ + \ # branch_eval_summary_to_json creates a json object from output of\ + \ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\ + \ repository\n def branch_eval_summary_to_json(\n improvements:\ + \ list[tuple[str, float, float, float]],\n regressions: list[tuple[str,\ + \ float, float, float]],\n no_changes: list[tuple[str, float]],\n\ + \ new=None,\n ) -> str:\n \"\"\"Generates a JSON object\ + \ from the _branch benchmark evaluations\"\"\"\n\n import json\n\n\ + \ summary = {\"improvements\": [], \"regressions\": [], \"no_changes\"\ + : [], \"new\": []}\n\n if len(improvements) > 0:\n improvements.sort(key=sort_score,\ + \ reverse=True)\n for improvement in improvements:\n \ + \ task, delta, base_score, new_score = improvement\n \ + \ summary[\"improvements\"].append(\n {\n \ + \ \"task\": task,\n \"base_score\"\ + : round(base_score, 2),\n \"new_score\": round(new_score,\ + \ 2),\n \"delta\": delta,\n }\n\ + \ )\n\n if len(regressions) > 0:\n regressions.sort(key=sort_score)\n\ + \ for regression in regressions:\n task, delta,\ + \ base_score, new_score = regression\n summary[\"regressions\"\ + ].append(\n {\n \"task\": task,\n\ + \ \"base_score\": round(base_score, 2),\n \ + \ \"new_score\": round(new_score, 2),\n \ + \ \"delta\": delta,\n }\n )\n\n\ + \ if len(no_changes) > 0:\n for entry in no_changes:\n\ + \ task, avg_score = entry\n summary[\"no_changes\"\ + ].append(\n {\"task\": task, \"average_score\": round(avg_score,\ + \ 2)}\n )\n\n if new is not None and len(new) > 0:\n\ + \ for entry in new:\n na, avg_score = entry\n\ + \ summary[\"new\"].append(\n {\"qna\"\ + : qna, \"average_score\": round(avg_score, 2)}\n )\n\n \ + \ return json.dumps(summary, indent=4)\n\n ######################################################################\n\ + \n gpu_available = torch.cuda.is_available()\n gpu_name = (\n \ + \ torch.cuda.get_device_name(torch.cuda.current_device())\n if\ + \ gpu_available\n else \"No GPU available\"\n )\n gpu_count\ + \ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\ + \ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MT_BENCH_BRANCH\n\ + \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ + \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ + )\n\n output_dir = \"/tmp/eval_output\"\n\n # TODO: candidate_branch\ + \ must be in same repo, not a fork, or, can compare main branch against\ + \ candidate, base models\n base_branch = base_branch or \"main\"\n \ + \ candidate_branch = candidate_branch or \"main\"\n\n ######################################################################\n\ + \ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\ + \ external judge model\n # and when that happens, much of this logic\ + \ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\ + \ #\n # With instructlab, model_name is synonomous with model_path\n\ + \ mt_bench_evaluators = [\n MTBenchBranchEvaluator(\n \ + \ model_name=candidate_model,\n judge_model_name=judge_model_name,\n\ + \ taxonomy_git_repo_path=taxonomy.path,\n branch=candidate_branch,\n\ + \ output_dir=output_dir,\n 
merge_system_user_message=merge_system_user_message,\n\ + \ ),\n MTBenchBranchEvaluator(\n model_name=base_model_dir,\n\ + \ judge_model_name=judge_model_name,\n taxonomy_git_repo_path=taxonomy.path,\n\ + \ branch=base_branch,\n output_dir=output_dir,\n \ + \ merge_system_user_message=merge_system_user_message,\n \ + \ ),\n ]\n\n # ilab/evaluate uses a magic word for its mt_bench\ + \ evaluator - `auto`\n # with `auto`, number of gpus allocated for serving\ + \ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ + \ if max_workers == \"auto\":\n try:\n usable_cpu_count\ + \ = len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n\ + \ usable_cpu_count = multiprocessing.cpu_count() // 2\n \ + \ max_workers = usable_cpu_count\n\n branches = [candidate_branch,\ + \ base_branch]\n m_paths = [candidate_model, base_model_dir]\n qa_pairs_and_errors\ + \ = []\n for i, evaluator in enumerate(mt_bench_evaluators):\n \ + \ branch = branches[i]\n m_path = m_paths[i]\n\n print(\n\ + \ f\"Generating questions and reference answers from qna files\ + \ for branch {branch}...\"\n )\n launch_vllm(m_path, gpu_count)\n\ + \n evaluator.gen_answers(\n server_url=VLLM_SERVER,\n\ + \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ + \ )\n\n stop_vllm()\n\n print(f\"Evaluating answers\ + \ for branch {branch}...\")\n overall_score, qa_pairs, error_rate\ + \ = evaluator.judge_answers(\n server_url=judge_endpoint,\n \ + \ api_key=judge_api_key,\n serving_gpus=gpu_count,\n\ + \ max_workers=max_workers,\n )\n\n qa_pairs_and_errors.append((overall_score,\ + \ qa_pairs, error_rate))\n\n overall_score, qa_pairs, error_rate = qa_pairs_and_errors[0]\n\ + \ base_overall_score, base_qa_pairs, base_error_rate = qa_pairs_and_errors[1]\n\ + \n qna_to_avg_scores = qa_pairs_to_qna_to_avg_scores(qa_pairs)\n base_qna_to_avg_scores\ + \ = qa_pairs_to_qna_to_avg_scores(base_qa_pairs)\n\n improvements, regressions,\ + \ no_changes, new_qnas = [], [], [], []\n\n for qna, avg_score in qna_to_avg_scores.items():\n\ + \ base_avg_score = base_qna_to_avg_scores.get(qna)\n if base_avg_score\ + \ is not None:\n if avg_score > base_avg_score:\n \ + \ improvements.append(\n (\n \ + \ qna,\n round(avg_score - base_avg_score, 2),\n\ + \ base_avg_score,\n avg_score,\n\ + \ )\n )\n elif avg_score ==\ + \ base_avg_score:\n no_changes.append((qna, avg_score))\n\ + \ else:\n regressions.append(\n \ + \ (\n qna,\n round(avg_score\ + \ - base_avg_score, 2),\n base_avg_score,\n \ + \ avg_score,\n )\n )\n\ + \ else:\n new_qnas.append((qna, avg_score))\n\n error_rate\ + \ = (error_rate + base_error_rate) / 2\n if error_rate > 0:\n \ + \ error_rate = round(error_rate, 2)\n\n summary = branch_eval_summary_to_json(\n\ + \ improvements,\n regressions,\n no_changes,\n \ + \ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"\ + : \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \ + \ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\ + ,\n \"overall_score\": overall_score,\n \"base_overall_score\"\ + : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\ + : summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\") as\ + \ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n" + image: quay.io/sallyom/instructlab-ocp:eval-10-8 resources: accelerator: count: '1' @@ -1006,56 +1108,12 @@ deploymentSpec: \ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_list: 
List[str] = None,\n models_folder:\ \ Optional[str] = None,\n device: str = None,\n) -> NamedTuple(\"outputs\"\ - , best_model=str, best_score=float):\n def launch_vllm(model_path: str,\ - \ gpu_count: int, retries: int = 60, delay: int = 5):\n import subprocess\n\ - \ import sys\n import time\n\n import requests\n\n\ - \ if gpu_count > 0:\n command = [\n sys.executable,\n\ - \ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\ - ,\n \"--model\",\n model_path,\n \ - \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ - \ ]\n else:\n command = [\n sys.executable,\n\ - \ \"-m\",\n \"vllm.entrypoints.openai.api_server\"\ - ,\n \"--model\",\n model_path,\n \ - \ ]\n\n subprocess.Popen(args=command)\n\n server_url =\ - \ \"http://localhost:8000/v1\"\n print(f\"Waiting for vLLM server\ - \ to start at {server_url}...\")\n\n for attempt in range(retries):\n\ - \ try:\n response = requests.get(f\"{server_url}/models\"\ - )\n if response.status_code == 200:\n \ - \ print(f\"vLLM server is up and running at {server_url}.\")\n \ - \ return\n except requests.ConnectionError:\n \ - \ pass\n\n print(\n f\"Server not available\ - \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ - \n )\n time.sleep(delay)\n\n raise RuntimeError(\n\ - \ f\"Failed to start vLLM server at {server_url} after {retries}\ - \ retries.\"\n )\n\n # This seems like excessive effort to stop\ - \ the vllm process, but merely saving & killing the pid doesn't work\n \ - \ # Also, the base image does not include `pkill` cmd, so can't pkill\ - \ -f vllm.entrypoints.openai.api_server either\n def stop_vllm_server_by_name():\n\ - \ import psutil\n\n for process in psutil.process_iter(attrs=[\"\ - pid\", \"name\", \"cmdline\"]):\n cmdline = process.info.get(\"\ - cmdline\")\n if cmdline and \"vllm.entrypoints.openai.api_server\"\ - \ in cmdline:\n print(\n f\"Found vLLM\ - \ server process with PID: {process.info['pid']}, terminating...\"\n \ - \ )\n try:\n process.terminate()\ - \ # Try graceful termination\n process.wait(timeout=5)\ - \ # Wait a bit for it to terminate\n if process.is_running():\n\ - \ print(\n f\"Forcefully\ - \ killing vLLM server process with PID: {process.info['pid']}\"\n \ - \ )\n process.kill() # Force kill\ - \ if it's still running\n print(\n \ - \ f\"Successfully stopped vLLM server with PID: {process.info['pid']}\"\ - \n )\n except psutil.NoSuchProcess:\n\ - \ print(f\"Process with PID {process.info['pid']} no\ - \ longer exists.\")\n except psutil.AccessDenied:\n \ - \ print(\n f\"Access denied when trying\ - \ to terminate process with PID {process.info['pid']}.\"\n \ - \ )\n except Exception as e:\n print(\n\ - \ f\"Failed to terminate process with PID {process.info['pid']}.\ - \ Error: {e}\"\n )\n\n import json\n import os\n\ - \n import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ - \n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\ - \n vllm_server = \"http://localhost:8000/v1\"\n\n gpu_available =\ - \ torch.cuda.is_available()\n gpu_name = (\n torch.cuda.get_device_name(torch.cuda.current_device())\n\ + , best_model=str, best_score=float):\n import json\n import os\n\n\ + \ import torch\n from helpers import (\n VLLM_SERVER,\n \ + \ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mt_bench\ + \ import MTBenchEvaluator\n\n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\ + ] = \"expandable_segments:True\"\n\n gpu_available = torch.cuda.is_available()\n\ + \ gpu_name = (\n torch.cuda.get_device_name(torch.cuda.current_device())\n\ \ if 
gpu_available\n else \"No GPU available\"\n )\n \ \ gpu_count = torch.cuda.device_count() if gpu_available else 0\n\n \ \ print(f\"GPU Available: {gpu_available}, {gpu_name}\")\n\n if models_list\ @@ -1076,24 +1134,23 @@ deploymentSpec: \ evaluator = MTBenchEvaluator(\n model_name=model_path,\n\ \ judge_model_name=judge_model_name,\n output_dir=\"\ /tmp/eval_output\",\n merge_system_user_message=merge_system_user_message,\n\ - \ )\n\n evaluator.gen_answers(\n server_url=vllm_server,\n\ + \ )\n\n evaluator.gen_answers(\n server_url=VLLM_SERVER,\n\ \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ - \ )\n\n stop_vllm_server_by_name()\n\n overall_score,\ - \ qa_pairs, turn_scores, error_rate = evaluator.judge_answers(\n \ - \ server_url=judge_endpoint,\n api_key=judge_api_key,\n \ - \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ - \ )\n\n mt_bench_data = {\n \"report_title\": \"\ - SKILLS EVALUATION REPORT\",\n \"model\": model_path,\n \ - \ \"judge_model\": judge_model_name,\n \"overall_score\"\ - : overall_score,\n \"turn_scores\": turn_scores,\n \ - \ \"qa_scores\": qa_pairs,\n \"error_rate\": error_rate,\n \ - \ }\n\n all_mt_bench_data.append(mt_bench_data)\n scores[model_path]\ - \ = overall_score\n\n with open(mt_bench_output.path, \"w\") as f:\n\ - \ json.dump(all_mt_bench_data, f, indent=4)\n\n outputs = NamedTuple(\"\ - outputs\", best_model=str, best_score=float)\n best_model = max(scores,\ - \ key=scores.get)\n best_score = scores[best_model]\n return outputs(best_model=best_model,\ - \ best_score=best_score)\n\n" - image: quay.io/sallyom/instructlab-ocp:eval-7ee213 + \ )\n\n stop_vllm()\n\n overall_score, qa_pairs, turn_scores,\ + \ error_rate = evaluator.judge_answers(\n server_url=judge_endpoint,\n\ + \ api_key=judge_api_key,\n serving_gpus=gpu_count,\n\ + \ max_workers=max_workers,\n )\n\n mt_bench_data\ + \ = {\n \"report_title\": \"SKILLS EVALUATION REPORT\",\n \ + \ \"model\": model_path,\n \"judge_model\": judge_model_name,\n\ + \ \"overall_score\": overall_score,\n \"turn_scores\"\ + : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\ + : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\ + \ scores[model_path] = overall_score\n\n with open(mt_bench_output.path,\ + \ \"w\") as f:\n json.dump(all_mt_bench_data, f, indent=4)\n\n \ + \ outputs = NamedTuple(\"outputs\", best_model=str, best_score=float)\n\ + \ best_model = max(scores, key=scores.get)\n best_score = scores[best_model]\n\ + \ return outputs(best_model=best_model, best_score=best_score)\n\n" + image: quay.io/sallyom/instructlab-ocp:eval-10-8 resources: accelerator: count: '1' @@ -1536,6 +1593,42 @@ root: constant: second taskInfo: name: pytorchjob-manifest-op-2 + run-mt-bench-branch-op: + cachingOptions: + enableCache: true + componentRef: + name: comp-run-mt-bench-branch-op + dependentTasks: + - createpvc + - createpvc-3 + - git-clone-op + - run-mt-bench-op + inputs: + artifacts: + taxonomy: + taskOutputArtifact: + outputArtifactKey: taxonomy + producerTask: git-clone-op + parameters: + base_branch: + componentInputParameter: repo_branch + base_model_dir: + runtimeValue: + constant: /model/model + candidate_branch: + componentInputParameter: repo_branch + candidate_model: + taskOutputParameter: + outputParameterKey: best_model + producerTask: run-mt-bench-op + device: + componentInputParameter: device + max_workers: + componentInputParameter: max_workers + merge_system_user_message: + componentInputParameter: merge_system_user_message + taskInfo: + name: run-mt-bench-branch-op 
run-mt-bench-op: cachingOptions: {} componentRef: @@ -1680,6 +1773,28 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc-3 + exec-run-mt-bench-branch-op: + configMapAsEnv: + - configMapName: kfp-model-server + keyToEnv: + - configMapKey: endpoint + envVar: JUDGE_ENDPOINT + - configMapKey: model + envVar: JUDGE_NAME + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 + - mountPath: /model + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc + secretAsEnv: + - keyToEnv: + - envVar: JUDGE_API_KEY + secretKey: api_key + secretName: judge-server exec-run-mt-bench-op: configMapAsEnv: - configMapName: kfp-model-server diff --git a/utils/consts.py b/utils/consts.py index 28835d4..eb803d9 100644 --- a/utils/consts.py +++ b/utils/consts.py @@ -1,3 +1,4 @@ PYTHON_IMAGE = "registry.access.redhat.com/ubi9/python-311:latest" TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox" OC_IMAGE = "registry.redhat.io/openshift4/ose-cli" +EVAL_IMAGE = "quay.io/sallyom/instructlab-ocp:eval-10-8" diff --git a/utils/helpers/__init__.py b/utils/helpers/__init__.py new file mode 100644 index 0000000..7242e2f --- /dev/null +++ b/utils/helpers/__init__.py @@ -0,0 +1,11 @@ +from .helpers import ( + VLLM_SERVER, + launch_vllm, + stop_vllm, +) + +__all__ = [ + "launch_vllm", + "stop_vllm", + "VLLM_SERVER", +] diff --git a/utils/helpers/helpers.py b/utils/helpers/helpers.py new file mode 100644 index 0000000..326fafe --- /dev/null +++ b/utils/helpers/helpers.py @@ -0,0 +1,84 @@ +VLLM_SERVER = "http://localhost:8000/v1" + + +def launch_vllm(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5): + import subprocess + import sys + import time + + import requests + + if gpu_count > 0: + command = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + model_path, + "--tensor-parallel-size", + str(gpu_count), + ] + else: + command = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + model_path, + ] + + subprocess.Popen(args=command) + + print(f"Waiting for vLLM server to start at {VLLM_SERVER}...") + + for attempt in range(retries): + try: + response = requests.get(f"{VLLM_SERVER}/models") + if response.status_code == 200: + print(f"vLLM server is up and running at {VLLM_SERVER}.") + return + except requests.ConnectionError: + pass + + print( + f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..." + ) + time.sleep(delay) + + raise RuntimeError( + f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries." + ) + + +# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work +# Also, the base image does not include `pkill` cmd, so can't pkill -f vllm.entrypoints.openai.api_server either +def stop_vllm(): + import psutil + + for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]): + cmdline = process.info.get("cmdline") + if cmdline and "vllm.entrypoints.openai.api_server" in cmdline: + print( + f"Found vLLM server process with PID: {process.info['pid']}, terminating..." 
+ ) + try: + process.terminate() # Try graceful termination + process.wait(timeout=5) # Wait a bit for it to terminate + if process.is_running(): + print( + f"Forcefully killing vLLM server process with PID: {process.info['pid']}" + ) + process.kill() # Force kill if it's still running + print( + f"Successfully stopped vLLM server with PID: {process.info['pid']}" + ) + except psutil.NoSuchProcess: + print(f"Process with PID {process.info['pid']} no longer exists.") + except psutil.AccessDenied: + print( + f"Access denied when trying to terminate process with PID {process.info['pid']}." + ) + except Exception as e: + print( + f"Failed to terminate process with PID {process.info['pid']}. Error: {e}" + ) diff --git a/utils/helpers/pyproject.toml b/utils/helpers/pyproject.toml new file mode 100644 index 0000000..1276633 --- /dev/null +++ b/utils/helpers/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "helpers" +version = "0.0.1" + +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta"
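
For reference, the artifact that run_mt_bench_branch_op writes to mt_bench_branch_output.path is a JSON report whose "summary" field is itself a JSON string produced by branch_eval_summary_to_json. A minimal sketch of its shape follows; the model names and scores are invented for illustration and are not produced by this patch.

import json

# Hypothetical example of the MT-Bench-Branch report artifact (values are made up).
example_report = {
    "report_title": "SKILLS EVALUATION REPORT",
    "model": "/output/model/hf_format/candidate",  # candidate_model (hypothetical path)
    "judge_model": "mistral-7b-instruct",          # JUDGE_NAME (hypothetical)
    "max_score": "10.0",
    "overall_score": 6.4,        # candidate branch score from judge_answers
    "base_overall_score": 6.1,   # base branch score
    "error_rate": 0.02,          # mean of the two branches' error rates
    # "summary" is a JSON string, not a nested object
    "summary": json.dumps(
        {
            "improvements": [
                {"task": "skill-a", "base_score": 5.5, "new_score": 7.0, "delta": 1.5}
            ],
            "regressions": [],
            "no_changes": [{"task": "skill-b", "average_score": 8.0}],
            "new": [{"qna": "skill-c", "average_score": 6.5}],
        },
        indent=4,
    ),
}
print(json.dumps(example_report, indent=4))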