From 3fa636fc55fe023246bf9daba710a430c3c5d3a0 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Mon, 17 Jun 2024 23:07:27 -0700 Subject: [PATCH] fixing scripts Summary: a few bugfixes for scripts 1) convert_hf_checkpoint.py had a gpt-fast dependency that wasn't caught due to it being in the path 2) eval.py had a bug due to the switch to aqt apis 3) generate.py had a bug due to the deprecation of the old quant apis Test Plan: python eval.py python eval.py -q int8dq --compile --limit 2 python eval.py -q int8wo --compile --limit 2 python eval.py -q int4wo-64 --compile --limit 2 python eval.py -q int4wo-64-gptq --compile sh benchmarks.sh (going to add the output results once they finish Reviewers: Subscribers: Tasks: Tags: --- scripts/convert_hf_checkpoint.py | 7 +------ torchao/_models/llama/eval.py | 8 +++++--- torchao/_models/llama/generate.py | 17 +++++++++++------ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py index 6a7da922a5..7b0f76903c 100644 --- a/scripts/convert_hf_checkpoint.py +++ b/scripts/convert_hf_checkpoint.py @@ -8,17 +8,12 @@ import json import re import shutil -import sys from pathlib import Path from typing import Optional import torch -# support running without installing as a package -wd = Path(__file__).parent.parent.resolve() -sys.path.append(str(wd)) - -from model import ModelArgs +from torchao._models.llama.model import ModelArgs @torch.inference_mode() diff --git a/torchao/_models/llama/eval.py b/torchao/_models/llama/eval.py index cccc13f09f..7842c3e66c 100644 --- a/torchao/_models/llama/eval.py +++ b/torchao/_models/llama/eval.py @@ -13,7 +13,7 @@ ) from torchao.quantization.quant_api import ( - quantize, int4wo, int8wo, int8da_int8w + quantize, int4wo, int8wo, int8da_int8w, unwrap_tensor_subclass ) from torchao._models._eval import TransformerEvalWrapper, InputRecorder @@ -70,7 +70,7 @@ def run_evaluation( if "int4wo" in quantization and "gptq" in quantization: groupsize=int(quantization.split("-")[-2]) assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" - + assert precision==torch.bfloat16, f"{quantization} requires precision or bfloat16 but got {precision}" inputs = InputRecorder( tokenizer, calibration_seq_length, @@ -83,9 +83,11 @@ def run_evaluation( calibration_limit, ).get_inputs() - quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize, precision=precision) + quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize) model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) model = quantizer.quantize(model, inputs).to(device) + else: + unwrap_tensor_subclass(model) if compile: model = torch.compile(model, mode="max-autotune", fullgraph=True) diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py index ea7200ea6b..a230ea4a34 100644 --- a/torchao/_models/llama/generate.py +++ b/torchao/_models/llama/generate.py @@ -189,20 +189,22 @@ def main( if quantization: from torchao.quantization.quant_api import ( - change_linear_weights_to_int4_woqtensors, - change_linear_weights_to_int8_woqtensors, - change_linear_weights_to_int8_dqtensors, + quantize, + int8wo, + int8da_int8w, + int4wo, autoquant, + unwrap_tensor_subclass ) if "int8wo" in quantization: - change_linear_weights_to_int8_woqtensors(model) + quantize(model, int8wo()) if "int8dq" in quantization: - change_linear_weights_to_int8_dqtensors(model) + quantize(model, int8da_int8w()) if "int4wo" in quantization: groupsize=int(quantization.split("-")[-1]) assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" - change_linear_weights_to_int4_woqtensors(model, groupsize=groupsize) + quantize(model, int4wo(groupsize=groupsize)) if "autoquant" == quantization: model = autoquant(model) generate( @@ -211,6 +213,9 @@ def main( 2, interactive=False ) + else: + unwrap_tensor_subclass(model) + model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9