Skip to content
This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 7c46a95

Browse files
[Rel Eng] Dial In LM Eval Tests Phase 1 (#289)
WAIT UNTIL UPSTREAM SYNC LANDS TO MERGE SUMMARY: * refactored lm-eval workflows to use a single script for generating a baseline * refactored lm-eval workflows to accept a config file so we can parameterize for the different length runs * added configuration for `remote-push` -> running `llama-3-8b` on 250 GSM prompts * removed lm-eval-smoke such that we have one single pathway for running lm-eval tests
1 parent 39e484e commit 7c46a95

15 files changed

+233
-299
lines changed

.github/actions/nm-lm-eval-smoke/action.yml

-32
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,20 @@
1-
name: run lm-eval full accuracy test
2-
description: 'run lm-eval full accuracy test'
1+
name: run lm-eval accuracy test
2+
description: 'run lm-eval accuracy test'
33
inputs:
44
python:
55
description: 'python version, e.g. 3.10.12'
66
required: true
77
venv:
88
description: 'name for python virtual environment'
99
required: true
10+
lm_eval_configuration:
11+
description: 'file containing test configuration'
12+
required: true
1013
runs:
1114
using: composite
1215
steps:
1316
- id: lm-eval
1417
run: |
15-
# move source directories
16-
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
17-
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
18-
1918
if [ -n "${{ inputs.venv }}" ]; then
2019
COMMIT=${{ github.sha }}
2120
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
@@ -26,7 +25,7 @@ runs:
2625
pip3 install pytest openai==1.3.9
2726
2827
SUCCESS=0
29-
pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
30-
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
28+
./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$?
29+
echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT"
3130
exit ${SUCCESS}
3231
shell: bash

.github/scripts/lm_eval_compare_hf_vs_vllm.py

-125
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#!/bin/bash
2+
# We can use this script to compute baseline accuracy on GSM for transformers.
3+
#
4+
# Make sure you have lm-eval-harness installed:
5+
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
6+
7+
usage() {
8+
echo
9+
echo "Runs lm eval harness on GSM8k using huggingface transformers."
10+
echo "This pathway is intended to be used to create baselines for "
11+
echo "our automated nm-test-accuracy workflow"
12+
echo
13+
echo "usage: ${0} <options>"
14+
echo
15+
echo " -m - huggingface stub or local directory of the model"
16+
echo " -b - batch size to run the evaluation at"
17+
echo " -d - device to use (e.g. cuda, cuda:0, auto, cpu)"
18+
echo " -l - limit number of samples to run"
19+
echo " -f - number of fewshot samples to use"
20+
echo
21+
}
22+
23+
while getopts "m:b:d:l:f:" OPT; do
24+
case ${OPT} in
25+
m )
26+
MODEL="$OPTARG"
27+
;;
28+
b )
29+
BATCH_SIZE="$OPTARG"
30+
;;
31+
d )
32+
DEVICE="$OPTARG"
33+
;;
34+
l )
35+
LIMIT="$OPTARG"
36+
;;
37+
f )
38+
FEWSHOT="$OPTARG"
39+
;;
40+
\? )
41+
usage
42+
exit 1
43+
;;
44+
esac
45+
done
46+
47+
lm_eval --model hf \
48+
--model_args pretrained=$MODEL \
49+
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
50+
--batch_size $BATCH_SIZE --device $DEVICE
+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
2+
# We can use this script to compute baseline accuracy on GSM for transformers.
3+
#
4+
# Make sure you have lm-eval-harness installed:
5+
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
6+
7+
usage() {
8+
echo
9+
echo "Runs lm eval harness on GSM8k using vllm server and compares to "
10+
echo "precomputed baseline (measured by HF transformers)."
11+
echo
12+
echo "usage: ${0} <options>"
13+
echo
14+
echo " -c - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
15+
echo
16+
}
17+
18+
while getopts "c:" OPT; do
19+
case ${OPT} in
20+
c )
21+
CONFIG="$OPTARG"
22+
;;
23+
\? )
24+
usage
25+
exit 1
26+
;;
27+
esac
28+
done
29+
30+
LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py

.github/workflows/nm-build-test.yml

+18-7
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,19 @@ on:
6666
description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI"
6767
type: string
6868
default: "false"
69+
# lm-eval related parameters
70+
lm_eval_label:
71+
description: "requested runner label (specifies instance)"
72+
type: string
73+
default: ""
74+
lm_eval_timeout:
75+
description: "time limit for lm_eval in minutes"
76+
type: string
77+
default: "60"
78+
lm_eval_configuration:
79+
description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
80+
type: string
81+
default: ""
6982

7083
jobs:
7184

@@ -134,16 +147,14 @@ jobs:
134147
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
135148
secrets: inherit
136149

137-
TEST-ACCURACY-FULL:
150+
LM-EVAL-SOLO:
138151
needs: [BUILD]
139-
if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
140-
uses: ./.github/workflows/nm-test-accuracy-full.yml
152+
uses: ./.github/workflows/nm-lm-eval.yml
141153
with:
142-
label: ${{ inputs.test_label_multi }}
143-
timeout: ${{ inputs.benchmark_timeout }}
154+
label: ${{ inputs.lm_eval_label }}
155+
timeout: ${{ inputs.lm_eval_timeout }}
144156
gitref: ${{ inputs.gitref }}
145-
Gi_per_thread: ${{ inputs.Gi_per_thread }}
146-
nvcc_threads: ${{ inputs.nvcc_threads }}
147157
python: ${{ inputs.python }}
148158
whl: ${{ needs.BUILD.outputs.whl }}
159+
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}
149160
secrets: inherit

.github/workflows/nm-test-accuracy-full.yml .github/workflows/nm-lm-eval.yml

+17-18
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,6 @@ on:
1515
description: "git commit hash or branch name"
1616
type: string
1717
required: true
18-
Gi_per_thread:
19-
description: 'requested GiB to reserve per thread'
20-
type: string
21-
required: true
22-
nvcc_threads:
23-
description: "number of threads nvcc build threads"
24-
type: string
25-
required: true
2618
python:
2719
description: "python version, e.g. 3.10.12"
2820
type: string
@@ -31,6 +23,10 @@ on:
3123
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
3224
type: string
3325
required: true
26+
lm_eval_configuration:
27+
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
28+
type: string
29+
required: true
3430

3531
# makes workflow manually callable
3632
workflow_dispatch:
@@ -47,14 +43,6 @@ on:
4743
description: "git commit hash or branch name"
4844
type: string
4945
required: true
50-
Gi_per_thread:
51-
description: 'requested GiB to reserve per thread'
52-
type: string
53-
required: true
54-
nvcc_threads:
55-
description: "number of threads nvcc build threads"
56-
type: string
57-
required: true
5846
python:
5947
description: "python version, e.g. 3.10.12"
6048
type: string
@@ -63,9 +51,13 @@ on:
6351
description: "whl to test (variable appears late binding so unusable outside 'download artifact')"
6452
type: string
6553
required: true
54+
lm_eval_configuration:
55+
description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)'
56+
type: string
57+
required: true
6658

6759
jobs:
68-
TEST-ACCURACY-FULL:
60+
LM-EVAL:
6961

7062
runs-on: ${{ inputs.label }}
7163
timeout-minutes: ${{ fromJSON(inputs.timeout) }}
@@ -77,6 +69,12 @@ jobs:
7769
uses: actions/setup-python@v5
7870
with:
7971
python-version: ${{ inputs.python }}
72+
73+
- name: install automation components
74+
run: |
75+
sudo apt-get update --fix-missing
76+
sudo apt-get install -y git-all
77+
sudo apt-get install -y curl
8078
8179
- name: checkout repository code
8280
uses: actions/checkout@v4
@@ -114,7 +112,8 @@ jobs:
114112
venv:
115113

116114
- name: run lm-eval-accuracy
117-
uses: ./.github/actions/nm-lm-eval-accuracy/
115+
uses: ./.github/actions/nm-lm-eval/
118116
with:
119117
python: ${{ inputs.python }}
120118
venv:
119+
lm_eval_configuration: ${{ inputs.lm_eval_configuration }}

0 commit comments

Comments (0)