This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

add fp8_e5m2 and nf4 ci
Signed-off-by: Yu, Zhentao <zhentao.yu@intel.com>
zhentaoyu committed Dec 14, 2023
1 parent 7290d05 commit 12dea92
Showing 1 changed file with 6 additions and 2 deletions.
.github/workflows/script/models/cpp_graph_inference.sh (8 changes: 6 additions & 2 deletions)
@@ -25,7 +25,7 @@ function main() {
 quant_script="./build/bin/quant_llama"
 infer_cmd="./build/bin/run_llama"
 input_model="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
-precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q8_e4m3" "q4_e2m1")
+precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q8_e4m3" "q8_e5m2" "q4_e2m1" "nf4")
 elif [[ "${model}" == "gpt-neox-20b" ]]; then
 convert_script="${working_dir}/scripts/convert_gptneox.py"
 quant_script="./build/bin/quant_gptneox"
@@ -121,9 +121,13 @@ function main() {
 # elif [[ ${precision} == "q4_j_vnni_bf16_b32" ]]; then
 # ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype bf16 --compute_dtype int8 --alg sym
 elif [[ ${precision} == "q8_e4m3" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8 --group_size 32 --scale_dtype fp8 --compute_dtype fp32 --alg sym
+elif [[ ${precision} == "q8_e5m2" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp8_e5m2 --group_size 32 --scale_dtype fp8 --compute_dtype fp32 --alg sym
 elif [[ ${precision} == "q4_e2m1" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype fp4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
+elif [[ ${precision} == "nf4" ]]; then
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype nf4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
 elif [[ ${precision} == "q4_j_vnni_b32" ]]; then
 ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_dtype int8 --alg sym
 elif [[ ${precision} == "q4_j_b32" ]]; then
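For reference, the two new CI branches reduce to the standalone quantization calls sketched below. This is only an illustration: the binary path, the fp32 model file, the output names, and the thread count are stand-ins for the script's ${quant_script}, ${working_dir}/${model}-fp32.bin, and $cores_per_instance, and it assumes the quantizer accepts nf4 as a weight_dtype; the remaining flags mirror the diff above.

#!/usr/bin/env bash
# Sketch only: quantize an already-converted fp32 model with the two precisions
# this commit adds to CI. Paths, file names, and thread count are placeholders.
set -e

QUANT=./build/bin/quant_llama            # quantization binary (assumed already built)
FP32_MODEL=./llama-2-7b-chat-fp32.bin    # output of the convert step (assumed to exist)
THREADS=$(nproc)                         # stand-in for the script's $cores_per_instance

# q8_e5m2: fp8 weights (5-bit exponent, 2-bit mantissa), fp8 scales, fp32 compute
"$QUANT" --model_file "$FP32_MODEL" --out_file ./llama-2-7b-chat-q8_e5m2.bin \
    --nthread "$THREADS" --weight_dtype fp8_e5m2 --group_size 32 \
    --scale_dtype fp8 --compute_dtype fp32 --alg sym

# nf4: 4-bit NormalFloat weights, fp32 scales, fp32 compute
"$QUANT" --model_file "$FP32_MODEL" --out_file ./llama-2-7b-chat-nf4.bin \
    --nthread "$THREADS" --weight_dtype nf4 --group_size 32 \
    --scale_dtype fp32 --compute_dtype fp32 --alg sym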
