Add Auto-Round support #581

Merged on Sep 4, 2024 (77 commits)
Changes from 70 commits
Commits (77 commits)
be78a08
initial flow for autoround
yiliu30 Jul 24, 2024
49f8075
update flow
yiliu30 Jul 25, 2024
62834a2
use int4 kernel
yiliu30 Jul 26, 2024
6433e75
remove debug code
yiliu30 Jul 26, 2024
65f46e5
update the forward
yiliu30 Jul 29, 2024
1e22c11
clean code
yiliu30 Jul 29, 2024
b8d37b9
e2e example
yiliu30 Jul 30, 2024
8d388fb
refine code
yiliu30 Jul 30, 2024
07a95a0
add requirements for test
yiliu30 Jul 30, 2024
6baa62f
update test
yiliu30 Jul 30, 2024
78a5067
update the readme
yiliu30 Jul 30, 2024
37e9f5f
add readme
yiliu30 Jul 30, 2024
8bfe76a
update the filenames
yiliu30 Jul 30, 2024
e25d6eb
update the np version
yiliu30 Jul 30, 2024
16a901d
add demo
yiliu30 Jul 30, 2024
5f16e8d
format
yiliu30 Jul 30, 2024
f3442c5
add more docs
yiliu30 Jul 31, 2024
432da79
format
yiliu30 Jul 31, 2024
7ee9f9b
add doc
yiliu30 Jul 31, 2024
e5ffcca
use `AffineQuantizedTensor`
yiliu30 Jul 31, 2024
cec375b
impl ar using multensors
yiliu30 Aug 8, 2024
a8f5681
clean code
yiliu30 Aug 8, 2024
ab08cb3
use hook + multensors
yiliu30 Aug 12, 2024
5ee2e06
separate mul_tensors into a new file
yiliu30 Aug 12, 2024
a5a3544
fix typos
yiliu30 Aug 12, 2024
e406ee8
rename mul_tensor to multi_tensor
yiliu30 Aug 13, 2024
7b6908e
enable amp
yiliu30 Aug 13, 2024
6a4d67c
eval model
yiliu30 Aug 13, 2024
c1fa230
add gen examples
yiliu30 Aug 13, 2024
5eef0a6
merge with main
yiliu30 Aug 13, 2024
e4cfa7d
add warmup to benchmark
yiliu30 Aug 13, 2024
41d9afd
add benchmark
yiliu30 Aug 13, 2024
e1cec58
Merge branch 'main' into re-a3
yiliu30 Aug 13, 2024
6f20e25
Merge branch 'auto_round_support-3' of https://github.com/yiliu30/tor…
yiliu30 Aug 13, 2024
ee1510c
Merge branch 'auto_round_support-3' into auto_round_support-3-bench
yiliu30 Aug 13, 2024
ca5bb30
clean code
yiliu30 Aug 13, 2024
e01e028
format code
yiliu30 Aug 13, 2024
8532af0
Merge pull request #3 from yiliu30/auto_round_support-3-bench
yiliu30 Aug 13, 2024
5106fe0
use tiny kernel
yiliu30 Aug 16, 2024
b82b638
add more note
yiliu30 Aug 16, 2024
bb08957
format
yiliu30 Aug 16, 2024
b5f08c5
Merge pull request #4 from yiliu30/auto_round_support-3-tinygemm-kernel
yiliu30 Aug 16, 2024
c8fc3f6
correct typos
yiliu30 Aug 16, 2024
7ee493f
remove hard code
yiliu30 Aug 19, 2024
1f75897
use intx
yiliu30 Aug 19, 2024
48d0903
Merge pull request #6 from yiliu30/auto_round_support-3-intx
yiliu30 Aug 19, 2024
34e6b49
enable offload for multitensor
yiliu30 Aug 20, 2024
eeca10b
update the default config
yiliu30 Aug 20, 2024
2b94608
refine note
yiliu30 Aug 21, 2024
0d38b20
Merge pull request #8 from yiliu30/enable-llama3
yiliu30 Aug 21, 2024
f04b594
Merge branch 'main' into auto_round_support-3
yiliu30 Aug 21, 2024
0e0b06d
update the version check
yiliu30 Aug 21, 2024
d0a4920
format
yiliu30 Aug 22, 2024
1e8a081
update
yiliu30 Aug 22, 2024
5b3374f
add ut
yiliu30 Aug 22, 2024
4ef0cdc
format
yiliu30 Aug 22, 2024
5f78c73
add scripts
yiliu30 Aug 22, 2024
5baae13
format code
yiliu30 Aug 22, 2024
6feb975
format
yiliu30 Aug 22, 2024
f6ed1e0
Merge pull request #9 from yiliu30/auto_round_support-3-unified-api
yiliu30 Aug 22, 2024
03cd9fc
update
yiliu30 Aug 22, 2024
e60b815
fix typo
yiliu30 Aug 22, 2024
b20e6d9
refine bench code
yiliu30 Aug 22, 2024
fabe8d2
Merge branch 'main' into auto_round_support-3
yiliu30 Aug 25, 2024
9ae5392
Enable `use_optimized_layer_output` and AO' llama (#12)
yiliu30 Aug 26, 2024
157c189
Refine the Doc (#14)
yiliu30 Aug 26, 2024
2df3f5f
add more docstring
yiliu30 Aug 26, 2024
d719460
add paper link
yiliu30 Aug 26, 2024
d7ba39e
correct some note
yiliu30 Aug 26, 2024
a2c6b28
add cmd
yiliu30 Aug 27, 2024
896d87f
resolve conflicts
yiliu30 Aug 28, 2024
6a8e073
udpdate the scripts
yiliu30 Aug 28, 2024
9e48d1a
revert some change
yiliu30 Aug 28, 2024
5ca125e
Add a lightweight configuration for quick benchmarking (#15)
yiliu30 Aug 29, 2024
b6d95ce
merge with main
yiliu30 Aug 30, 2024
21686f1
update quant method name
yiliu30 Aug 30, 2024
96f745d
Wrap model's buffers and params to `MultiTensor` & update the results…
yiliu30 Sep 3, 2024
96 changes: 96 additions & 0 deletions test/prototype/test_autoround.py
@@ -0,0 +1,96 @@
import pytest
from torchao.prototype.autoround.utils import is_auto_round_available

if not is_auto_round_available():
    pytest.skip("AutoRound is not available", allow_module_level=True)

import torch
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
    TestCase,
)
from torchao import quantize_

from torchao.dtypes import AffineQuantizedTensor
from torchao.prototype.autoround.core import (
    apply_auto_round,
    prepare_model_for_applying_auto_round_,
)
from torchao.prototype.autoround.multi_tensor import MultiTensor
from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])


# Copied from https://github.com/pytorch/ao/pull/721
class TwoLinear(torch.nn.Module):
    def __init__(self, in_features=64, out_features=128):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, out_features)
        self.linear2 = torch.nn.Linear(in_features, out_features)

    def forward(self, x, y):
        x = self.linear1(x)
        y = self.linear2(y)
        return x + y


class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.two_linear1 = TwoLinear()
        self.two_linear2 = TwoLinear(128, 256)

    def forward(self, x, y):
        x1 = self.two_linear1(x, y)
        x2 = self.two_linear2(x1, x1)
        return x2


def _is_two_linear(mod, fqn):
    return isinstance(mod, TwoLinear)


class TestAutoRound(TestCase):

    @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="Requires torch 2.5 or later")
    @parametrize("device", _AVAILABLE_DEVICES)
    @torch.no_grad()
    def test_auto_round(self, device: str):
        example_inputs = (
            torch.randn(32, 64).to(device),
            torch.randn(32, 64).to(device),
        )
        m = M().eval().to(device)
        before_quant = m(*example_inputs)
        prepare_model_for_applying_auto_round_(
            m,
            is_target_module=_is_two_linear,
            bits=7,
            group_size=32,
            iters=20,
            device=device,
        )
        input1 = []
        input2 = []
        for _ in range(10):
            input1.append(torch.randn(32, 64).to(device))
            input2.append(torch.randn(32, 64).to(device))

        mt_input1 = MultiTensor(input1)
        mt_input2 = MultiTensor(input2)
        out = m(mt_input1, mt_input2)
        quantize_(m, apply_auto_round(), _is_two_linear, device=device)
        for l in m.modules():
            if isinstance(l, torch.nn.Linear):
                assert isinstance(l.weight, AffineQuantizedTensor)
        after_quant = m(*example_inputs)
        assert after_quant is not None, "Quantized model forward pass failed"


instantiate_parametrized_tests(TestAutoRound)

if __name__ == "__main__":
    run_tests()
8 changes: 8 additions & 0 deletions torchao/_models/llama/benchmarks.sh
@@ -12,6 +12,9 @@
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt

# For auto-round
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround --write_result benchmark_results.txt

export MODEL_REPO=meta-llama/Meta-Llama-3-8B
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
@@ -30,3 +33,8 @@
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt --kv_cache_quantization --max_new_tokens 2048
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt --max_new_tokens 8192
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --write_result benchmark_results.txt --kv_cache_quantization --max_new_tokens 8192

# For auto-round
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cpu --write_result benchmark_results.txt
# w/ quant_lm_head
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cpu-200-128-1 --write_result benchmark_results.txt
31 changes: 29 additions & 2 deletions torchao/_models/llama/generate.py
@@ -30,7 +30,7 @@ def device_sync(device):
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

from torchao._models.llama.model import Transformer, prepare_inputs_for_model
from torchao._models.llama.model import Transformer, prepare_inputs_for_model, TransformerBlock
from torchao._models.llama.tokenizer import get_tokenizer

def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
@@ -220,6 +220,33 @@ def main(
            groupsize=int(quantization.split("-")[-1])
            assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
            quantize_(model, int4_weight_only(group_size=groupsize))

        if "autoround" in quantization:
            from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_
            from transformers import AutoTokenizer
            _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent)
            # parse args from quantization string:
            # autoround-<model_device>-<iters>-<groupsize>-<quant_lm_head>-<batch_size>-<seqlen>
            # e.g., autoround-cpu-200-128-0-8-2048
            _quant_args = quantization.split("-")
            _default_quant_args = [200, 128, False, 8, 2048]
            _model_device = _quant_args[1] if len(_quant_args) > 1 else device
            _quant_args = _quant_args[2:]
            iters, groupsize, quant_lm_head, batch_size, seqlen = [int(x) for x in _quant_args] + _default_quant_args[len(_quant_args):]
            model = model.to(_model_device)
            print(f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen})")
            with torch.device(_model_device):
                model.setup_caches(max_batch_size=batch_size, max_seq_length=seqlen, training=True)

            if quant_lm_head:
                is_target_module = (
                    lambda mod, fqn: isinstance(mod, TransformerBlock) or "output" in fqn
                )
            else:
                is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock)
            quantize_model_with_autoround_(model=model, tokenizer=_tokenizer, is_target_module=is_target_module, bits=4, seqlen=seqlen, bs=batch_size, iters=iters)
            model.to(device)
            model.reset_caches()
        if "autoquant" == quantization:
            model = autoquant(model, manual=True)

@@ -367,7 +394,7 @@ def callback(x):
    parser.add_argument('--top_k', type=int, default=200, help='Top-k for sampling.')
    parser.add_argument('--temperature', type=float, default=0.8, help='Temperature for sampling.')
    parser.add_argument('--checkpoint_path', type=Path, default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), help='Model checkpoint path.')
    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant')
    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant, autoround-<model_device>-<iters>-<groupsize>-<quant_lm_head>-<batch_size>-<seqlen>')
    parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')
    parser.add_argument('--save', action='store_true', help='Whether to save the quantized model.')
    parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
6 changes: 6 additions & 0 deletions torchao/_models/llama/model.py
@@ -169,6 +169,12 @@ def setup_caches(self, max_batch_size, max_seq_length, training: bool = False):

        self.freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.dim // self.config.n_head, self.config.rope_base, dtype)
        self.causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))

    def reset_caches(self):
        self.max_batch_size = -1
        self.max_seq_length = -1
        self.freqs_cis: Optional[Tensor] = None
        self.mask_cache: Optional[Tensor] = None

    def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
        assert self.freqs_cis is not None, "Caches must be initialized first"
103 changes: 103 additions & 0 deletions torchao/prototype/autoround/README.md
@@ -0,0 +1,103 @@
# Auto-Round

Auto-Round is an advanced quantization algorithm designed for low-bit LLM inference. It leverages [sign gradient descent](https://arxiv.org/abs/1905.12938) to fine-tune the rounding values and min/max clipping values of weights. The approach is competitive with recent post-training quantization methods, introduces no additional inference overhead, and keeps tuning costs low. This module provides end-to-end examples for quantizing floating-point models to low-bit precision and integrates with torchao's `quantize_` API and low-bit kernels.
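
For intuition, the sketch below is a minimal, self-contained illustration of the rounding-value optimization. It uses assumed names and hyperparameters, per-tensor quantization, and a plain weight-reconstruction loss for brevity; the actual algorithm minimizes the block output error on calibration data and also tunes the min/max clipping values.

```python
import torch
import torch.nn.functional as F


def ste_round(x):
    # Straight-through estimator: round in the forward pass,
    # identity gradient in the backward pass.
    return (x.round() - x).detach() + x


bits = 4
w = torch.randn(256, 256)
scale = w.abs().amax() / (2 ** (bits - 1))
zero_point = 2 ** (bits - 1)
# Learnable rounding perturbation, kept within [-0.5, 0.5].
v = torch.zeros_like(w, requires_grad=True)

for _ in range(200):  # corresponds to `iters` in the torchao API
    q = torch.clamp(ste_round(w / scale + zero_point + v), 0, 2**bits - 1)
    w_dq = (q - zero_point) * scale      # dequantized weight
    loss = F.mse_loss(w_dq, w)
    loss.backward()
    with torch.no_grad():
        v -= 5e-3 * v.grad.sign()        # sign-SGD step
        v.clamp_(-0.5, 0.5)
        v.grad.zero_()
```

In the torchao flow described below, this optimization happens inside the prepared model's forward pass over the `MultiTensor` calibration data.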

## Usage

### Quick Start

```bash
python autoround_llm.py -m /model/name/or/path
```


> [!NOTE]
> Before running, ensure you have installed `auto-round` by running `pip install -r requirements.txt`.


### Detailed Usage

`Auto-Round` is a calibration-based quantization algorithm. The flow involves three main steps: 1) insert hooks into the modules you want to quantize, 2) wrap the calibration data with `MultiTensor` and run the model, and 3) replace the optimized weights with `AffineQuantizedTensor` so that the appropriate low-bit kernel is selected.

> [!NOTE]
> To learn more about the flow and `MultiTensor`, please refer to [this example](https://github.com/pytorch/ao/blob/main/tutorials/calibration_flow/gptq_like.py).
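
Conceptually, `MultiTensor` lets a single forward pass carry many calibration samples: it wraps a list of tensors, and each intercepted operation is replayed once per wrapped tensor. The toy sketch below uses a hypothetical `ToyMultiTensor` and an explicit `replay` helper to show the idea; the real `MultiTensor` subclasses `torch.Tensor` so that a plain `model(...)` call works transparently.

```python
import torch


class ToyMultiTensor:
    """Toy stand-in for MultiTensor: holds one tensor per calibration sample."""

    def __init__(self, tensors):
        self.tensors = list(tensors)


def replay(fn, *args):
    # Run `fn` once per calibration sample, substituting the i-th wrapped
    # tensor wherever a ToyMultiTensor appears in the arguments.
    n = max(len(a.tensors) for a in args if isinstance(a, ToyMultiTensor))
    outputs = []
    for i in range(n):
        call_args = [a.tensors[i] if isinstance(a, ToyMultiTensor) else a for a in args]
        outputs.append(fn(*call_args))
    return ToyMultiTensor(outputs)


linear = torch.nn.Linear(64, 128)
calib = ToyMultiTensor([torch.randn(32, 64) for _ in range(10)])
out = replay(linear, calib)      # the layer sees all 10 calibration batches
print(len(out.tensors))          # 10
```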

#### Step 1: Prepare the Model
```python
model = ... # Load your model
model_device = next(model.parameters()).device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define a function to identify target modules for quantization.
# For example, to apply Auto-Round to all decoder layers and the `lm_head` in a Llama model:
decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer
is_target_module = lambda mod, fqn: isinstance(mod, decoder_cls) or "lm_head" in fqn
# Prepare the model for Auto-Round
from torchao.prototype.autoround.core import prepare_model_for_applying_auto_round_

prepare_model_for_applying_auto_round_(
    model,
    is_target_module=is_target_module,
    bits=4,
    group_size=128,
    iters=200,
    device=device,
)
```
> [!NOTE]
> To avoid OOM issues, load the model on CPU, and set `device` to `'cuda'`.
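
For instance, a Hugging Face model can be kept on the CPU while the optimization runs on the GPU; the model name below is only an example.

```python
import torch
import transformers

# Keep the full-precision weights on CPU to avoid OOM; the optimization
# itself runs on `device`.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.bfloat16
)
model_device = next(model.parameters()).device  # cpu
device = "cuda" if torch.cuda.is_available() else "cpu"
```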

#### Step 2: Apply Optimization
Wrap all calibration inputs in a `MultiTensor` so that the calibration data is tracked for every module being optimized:

```python
input_ids_lst = []
for data in dataloader:
    input_ids_lst.append(data["input_ids"].to(model_device))

multi_t_input_ids = MultiTensor(input_ids_lst)
# The optimization is applied during the forward pass
out = model(multi_t_input_ids)
```
#### Step 3: Finalize Quantization
After the optimized `zero_point` and `scale` values have been obtained, create an `AffineQuantizedTensor`
for each target weight so that the appropriate low-bit kernel is selected.

```python
from torchao.prototype.autoround.core import apply_auto_round

quantize_(model, apply_auto_round(), is_target_module)
```
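
Once `quantize_` returns, the target weights are `AffineQuantizedTensor` instances and the model can be used as before. A quick sanity check, reusing a calibration batch from Step 2, might look like this:

```python
import torch
from torchao.dtypes import AffineQuantizedTensor

# Confirm that the target linear weights were swapped for quantized tensors.
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear) and isinstance(module.weight, AffineQuantizedTensor):
        print(f"{name}: weight is quantized")

# Smoke test: a normal (non-MultiTensor) forward pass.
with torch.no_grad():
    out = model(input_ids_lst[0])
```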

## End-to-End Results
> **Reviewer comment (Contributor):** so what about performance results?

### [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
| bf16 | 0.7080 | 0.6783 | 0.8003 | 0.7403 | 0.5910 | 0.7303 |
| auto-round-4bit | 0.6989 | 0.6566 | 0.7943 | 0.7285 | 0.5856 | 0.7295 |
| torchao-int4wo | 0.6883 | 0.6363 | 0.7938 | 0.7348 | 0.5784 | 0.6980 |

### [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
| bf16 | 0.6881 | 0.6389 | 0.7840 | 0.7222 | 0.5772 | 0.7184 |
| auto-round-4bit | 0.6811 | 0.6218 | 0.7758 | 0.7285 | 0.5694 | 0.7101 |
| torchao-int4wo | 0.6728 | 0.5939 | 0.7737 | 0.7222 | 0.5612 | 0.7132 |


### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
| bf16 | 0.6347 | 0.4647 | 0.7644 | 0.6606 | 0.5770 | 0.7070 |
| auto-round-4bit | 0.6335 | 0.4533 | 0.7661 | 0.6685 | 0.5705 | 0.7091 |
| torchao-int4wo | 0.6252 | 0.4427 | 0.7617 | 0.6654 | 0.5674 | 0.6889 |

> [!NOTE]
> - `auto-round-4bit` represents the following configuration: `bits=4`, `iters=200`, `seqlen=2048`, `train_bs=8`, `group_size=128`, `use_optimized_layer_output=True` and `quant_lm_head=False`. <br>
> - `torchao-int4wo` represents `int4_weight_only(group_size=128)` and `quant_lm_head=False`.


## Credits

- Paper: https://arxiv.org/abs/2309.05516
- Authors: [Intel® Neural Compressor Team](https://github.com/intel/neural-compressor)
5 changes: 5 additions & 0 deletions torchao/prototype/autoround/__init__.py
@@ -0,0 +1,5 @@
from torchao.prototype.autoround.core import (
    apply_auto_round,
    prepare_model_for_applying_auto_round_,
)
from torchao.prototype.autoround.multi_tensor import MultiTensor