This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

add fp8_e5m2 and nf4 ci
Signed-off-by: Yu, Zhentao <zhentao.yu@intel.com>
zhentaoyu committed Dec 14, 2023
1 parent 7290d05 commit 12dea92
Showing 1 changed file with 6 additions and 2 deletions.
.github/workflows/script/models/cpp_graph_inference.sh (8 changes: 6 additions & 2 deletions)
@@ -25,7 +25,7 @@ function main() {
 quant_script="./build/bin/quant_llama"
 infer_cmd="./build/bin/run_llama"
 input_model="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
-precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q8_e4m3" "q4_e2m1")
+precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q8_e4m3" "q8_e5m2" "q4_e2m1" "nf4")
 elif [[ "${model}" == "gpt-neox-20b" ]]; then
 convert_script="${working_dir}/scripts/convert_gptneox.py"
 quant_script="./build/bin/quant_gptneox"
@@ -121,9 +121,13 @@ function main() {
 # elif [[ ${precision} == "q4_j_vnni_bf16_b32" ]]; then
 # ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype bf16 --compute_dtype int8 --alg sym
 elif [[ ${precision} == "q8_e4m3" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8 --group_size 32 --scale_dtype fp8 --compute_dtype fp32 --alg sym
+elif [[ ${precision} == "q8_e5m2" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8_e5m2 --group_size 32 --scale_dtype fp8 --compute_dtype fp32 --alg sym
 elif [[ ${precision} == "q4_e2m1" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+elif [[ ${precision} == "nf4" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype nf4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
 elif [[ ${precision} == "q4_j_vnni_b32" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_dtype int8 --alg sym
 elif [[ ${precision} == "q4_j_b32" ]]; then
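For reference, the two new CI branches reduce to the standalone quantization calls sketched below. This is only an illustration: the binary path, the fp32 model file, the output names, and the thread count are stand-ins for the script's ${quant_script}, ${working_dir}/${model}-fp32.bin, and $cores_per_instance, and it assumes the quantizer accepts nf4 as a weight_dtype; the remaining flags mirror the diff above.

#!/usr/bin/env bash
# Sketch only: quantize an already-converted fp32 model with the two precisions
# this commit adds to CI. Paths, file names, and thread count are placeholders.
set -e

QUANT=./build/bin/quant_llama            # quantization binary (assumed already built)
FP32_MODEL=./llama-2-7b-chat-fp32.bin    # output of the convert step (assumed to exist)
THREADS=$(nproc)                         # stand-in for the script's $cores_per_instance

# q8_e5m2: fp8 weights (5-bit exponent, 2-bit mantissa), fp8 scales, fp32 compute
"$QUANT" --model_file "$FP32_MODEL" --out_file ./llama-2-7b-chat-q8_e5m2.bin \
    --nthread "$THREADS" --weight_dtype fp8_e5m2 --group_size 32 \
    --scale_dtype fp8 --compute_dtype fp32 --alg sym

# nf4: 4-bit NormalFloat weights, fp32 scales, fp32 compute
"$QUANT" --model_file "$FP32_MODEL" --out_file ./llama-2-7b-chat-nf4.bin \
    --nthread "$THREADS" --weight_dtype nf4 --group_size 32 \
    --scale_dtype fp32 --compute_dtype fp32 --alg sym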
