Enhance Auto-Round #870

Merged: 10 commits, Sep 18, 2024

Changes from 3 commits
39 changes: 22 additions & 17 deletions torchao/prototype/autoround/README.md
@@ -71,31 +71,36 @@ quantize_(model, apply_auto_round(), is_target_module)

## End-to-End Results
### [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
-| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
-| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
-| bf16 | 0.7080 | 0.6783 | 0.8003 | 0.7403 | 0.5910 | 0.7303 |
-| auto-round-4bit | 0.6988 | 0.6533 | 0.7949 | 0.7372 | 0.5837 | 0.7250 |
-| torchao-int4wo | 0.6883 | 0.6363 | 0.7938 | 0.7348 | 0.5784 | 0.6980 |
+| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
+| ---------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
+| bf16 | 0.7080 | 0.6783 | 0.8003 | 0.7403 | 0.5910 | 0.7303 |
+| torchao-int4wo | 0.6883 | 0.6363 | 0.7938 | 0.7348 | 0.5784 | 0.6980 |
+| autoround-4bit | 0.6996 | 0.6669 | 0.7916 | 0.7285 | 0.5846 | 0.7262 |
+| autoround-4bit* | 0.7010 | 0.6621 | 0.7976 | 0.7316 | 0.5847 | 0.7291 |

### [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
-| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
-| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
-| bf16 | 0.6881 | 0.6389 | 0.7840 | 0.7222 | 0.5772 | 0.7184 |
-| auto-round-4bit | 0.6818 | 0.6232 | 0.7862 | 0.7230 | 0.5661 | 0.7105 |
-| torchao-int4wo | 0.6728 | 0.5939 | 0.7737 | 0.7222 | 0.5612 | 0.7132 |
+| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
+| ---------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
+| bf16 | 0.6881 | 0.6389 | 0.7840 | 0.7222 | 0.5772 | 0.7184 |
+| torchao-int4wo | 0.6728 | 0.5939 | 0.7737 | 0.7222 | 0.5612 | 0.7132 |
+| autoround-4bit | 0.6796 | 0.6237 | 0.7758 | 0.7198 | 0.5664 | 0.7122 |
+| autoround-4bit* | 0.6827 | 0.6273 | 0.7737 | 0.7348 | 0.5657 | 0.7120 |


### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
-| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
-| -------------- | ------- | ------ | ------ | ---------- | --------- | -------------- |
-| bf16 | 0.6347 | 0.4647 | 0.7644 | 0.6606 | 0.577 | 0.7070 |
-| auto-round-4bit | 0.6327 | 0.4534 | 0.7590 | 0.6661 | 0.5706 | 0.7143 |
-| torchao-int4wo | 0.6252 | 0.4427 | 0.7617 | 0.6654 | 0.5674 | 0.6889 |
+| | Avg. | Mmlu | Piqa | Winogrande | Hellaswag | Lambada_openai |
+| ---------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
+| bf16 | 0.6347 | 0.4647 | 0.7644 | 0.6606 | 0.5770 | 0.7070 |
+| torchao-int4wo | 0.6252 | 0.4427 | 0.7617 | 0.6654 | 0.5674 | 0.6889 |
+| autoround-4bit | 0.6311 | 0.4548 | 0.7606 | 0.6614 | 0.5717 | 0.7072 |
+| autoround-4bit* | 0.6338 | 0.4566 | 0.7661 | 0.6646 | 0.5688 | 0.7130 |

> [!NOTE]
-> - `auto-round-4bit` represents the following configuration: `bits=4`, `iters=200`, `seqlen=2048`, `train_bs=8`, `group_size=128`, and `quant_lm_head=False`. <br>
> - `torchao-int4wo` represents `int4_weight_only(group_size=128)` and `quant_lm_head=False`.
-> - If the model includes operations without a deterministic implementation (such as Flash Attention), the results may differ slightly.
+> - `autoround-4bit` represents the following configuration: `bits=4`, `iters=200`, `seqlen=2048`, `train_bs=8`, `group_size=128`, and `quant_lm_head=False`. <br>
+> - `autoround-4bit*` represents the following configuration: `bits=4`, `iters=200`, `seqlen=2048`, `train_bs=4`, `gradient_accumulate_steps=2`, `group_size=128`, and `quant_lm_head=False`. <br>
+> - Compared to `autoround-4bit` (`train_bs=8`), `autoround-4bit*` accumulates gradients over two batches (4 samples per batch) before each parameter update; see the sketch below. <br>
+> - To reproduce results, run `eval_autoround.py` with `AO_USE_DETERMINISTIC_ALGORITHMS=1`.
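
For intuition, here is a minimal, self-contained sketch of the accumulation arithmetic behind the starred configuration. The tiny model, synthetic batches, and squared-mean loss are toy stand-ins, not the auto-round objective; only the batching is the point.

```python
import torch

# Toy stand-ins, only to make the accumulation arithmetic concrete.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
accumulate_steps = 2  # gradient_accumulate_steps=2
batches = [torch.randn(4, 16) for _ in range(8)]  # train_bs=4

for step, batch in enumerate(batches):
    loss = model(batch).pow(2).mean()
    # Scale so the accumulated gradient matches a single train_bs=8 batch.
    (loss / accumulate_steps).backward()
    if (step + 1) % accumulate_steps == 0:
        optimizer.step()  # one update per 2 batches = 8 samples
        optimizer.zero_grad()
```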


## Credits
21 changes: 20 additions & 1 deletion torchao/prototype/autoround/autoround_llm.py
@@ -5,7 +5,7 @@

import torchao
import torchao.prototype.autoround.utils as ar_utils
-
+from typing import Optional
from torchao.prototype.autoround.core import (
    apply_auto_round,
    prepare_model_for_applying_auto_round_,
@@ -29,6 +29,8 @@ def quantize_model_with_autoround_(
    bs: int = 8,
    nsamples: int = 128,
    use_optimized_layer_output: bool = False,
+    gradient_accumulate_steps: Optional[int] = 1,
+    compile_optimization_process: Optional[bool] = False,
):
    # Step 1. Prepare the model for applying auto-round

@@ -42,6 +44,8 @@
        group_size,
        iters,
        use_optimized_layer_output,
+        gradient_accumulate_steps,
+        compile_optimization_process,
        device=device,
    )

@@ -107,6 +111,8 @@ def main(args):
        bs=args.train_bs,
        nsamples=args.nsamples,
        use_optimized_layer_output=args.use_optimized_layer_output,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
+        compile_optimization_process=args.compile_optimization_process,
    )
    # Revert the `use_cache` for generation stage.
    model.config.use_cache = True
@@ -156,6 +162,12 @@ def main(args):
        type=int,
        help="Sequence length for calibration process",
    )
+    parser.add_argument(
+        "--gradient_accumulate_steps",
+        default=1,
+        type=int,
+        help="Number of gradient accumulation steps",
+    )
    parser.add_argument(
        "--quant_lm_head",
        default=False,
@@ -168,6 +180,13 @@ def main(args):
        action="store_true",
        help="Use the optimized layer output for next layer or not",
    )
+    parser.add_argument(
+        "-c",
+        "--compile_optimization_process",
+        default=False,
+        action="store_true",
+        help="Whether to compile the optimization process",
+    )
    parser.add_argument(
        "-d",
        "--model_device",
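Putting the new arguments together, a hedged usage sketch of the updated entry point follows. The keyword names come from this diff; the leading positional arguments and the checkpoint name are assumptions, since the hunks above truncate the full signature.

```python
import torchao.prototype.autoround.utils as ar_utils
from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_

# `get_float_model_info` is shown in the utils.py portion of this diff.
model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
    "meta-llama/Llama-2-7b-chat-hf"  # assumed checkpoint; any causal LM works
)
quantize_model_with_autoround_(
    model,
    tokenizer,
    decoder_cls,  # assumed leading positional args; not shown in this hunk
    bs=4,  # train_bs=4 ...
    gradient_accumulate_steps=2,  # ... accumulated twice -> effective batch of 8
    compile_optimization_process=True,  # torch.compile the per-block optimization
)
```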
15 changes: 13 additions & 2 deletions torchao/prototype/autoround/core.py
@@ -20,6 +20,8 @@ class _AutoRoundConfig:
    group_size: int = 128
    iters: int = 200
    use_optimized_layer_output: bool = False
+    gradient_accumulate_steps: int = 1
+    compile_optimization_process: bool = False


_auto_round_config = _AutoRoundConfig()
@@ -82,6 +84,8 @@ def prepare_model_for_applying_auto_round_(
    group_size: int = 128,
    iters: int = 200,
    use_optimized_layer_output: bool = False,
+    gradient_accumulate_steps: Optional[int] = 1,
+    compile_optimization_process: Optional[bool] = False,
    device: Optional[torch.types.Device] = None,
):
    """Prepares the model for applying auto round optimization.
@@ -94,7 +98,9 @@
        group_size (int, optional): The group size for quantization. Defaults to 128.
        iters (int, optional): The number of iterations for optimization. Defaults to 200.
        use_optimized_layer_output (bool, optional): Whether to use optimized layer output. Defaults to False.
-        device (Optional[torch.types.Device], optional): The device to use for accelerating optimization and calibration.
+        gradient_accumulate_steps (Optional[int]): The number of gradient accumulation steps. Defaults to 1.
+        compile_optimization_process (Optional[bool]): Whether to compile the optimization process. Defaults to False.
+        device (Optional[torch.types.Device]): The device to use for accelerating optimization and calibration.
            Defaults to None.
    """
    _multi_tensor_config.device = device
@@ -105,6 +111,8 @@
    _auto_round_config.group_size = group_size
    _auto_round_config.iters = iters
    _auto_round_config.use_optimized_layer_output = use_optimized_layer_output
+    _auto_round_config.gradient_accumulate_steps = gradient_accumulate_steps
+    _auto_round_config.compile_optimization_process = compile_optimization_process

    logging.warning(f"config {_auto_round_config}")

@@ -312,9 +320,12 @@ def _apply_auto_round_optimization(
        bits=config.bits,
        iters=config.iters,
        group_size=config.group_size,
+        gradient_accumulate_steps=config.gradient_accumulate_steps,
        amp=True,
        model_dtype=next(block.parameters()).dtype,
    )
+    if config.compile_optimization_process:
+        rounder.quant_block_v2_ = torch.compile(rounder.quant_block_v2_)

    with torch.enable_grad():
        rounder.quant_block_v2_(
@@ -326,7 +337,7 @@
    block.to(orig_device)


-@ar_utils.dump_elapsed_time()
+@ar_utils.dump_elapsed_time(record=True)
@torch.no_grad()
def apply_auto_round_optimization(
    module: torch.nn.Module,
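The `compile_optimization_process` path above re-binds a compiled version of `quant_block_v2_` onto the rounder instance. A minimal sketch of that pattern, with a toy class standing in for the auto-round block optimizer:

```python
import torch

class Rounder:
    # Toy stand-in for the auto-round optimizer; the re-binding pattern
    # is the point here, not the rounding logic.
    def quant_block_v2_(self, x: torch.Tensor) -> torch.Tensor:
        return torch.round(x * 4) / 4

rounder = Rounder()
# Re-assign the bound method with its compiled counterpart; subsequent
# calls transparently run the torch.compile'd version.
rounder.quant_block_v2_ = torch.compile(rounder.quant_block_v2_)
print(rounder.quant_block_v2_(torch.randn(8)))
```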
56 changes: 49 additions & 7 deletions torchao/prototype/autoround/eval_autoround.py
@@ -1,16 +1,40 @@
import argparse
+import logging
+import os

-import torchao.prototype.autoround.utils as ar_utils
-
-ar_utils.freeze_random(42)
import torch

-torch.use_deterministic_algorithms(True, warn_only=True)
import torchao

+import torchao.prototype.autoround.utils as ar_utils
import torchao.quantization
from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

+logger = logging.getLogger(__name__)
+
+ar_utils.freeze_random(42)
+
+
+def _use_deterministic():
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch.use_deterministic_algorithms(True, warn_only=False)
+    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+    logger.warning(
+        (
+            "Reproducibility is enabled with `AO_USE_DETERMINISTIC_ALGORITHMS=1`, which sets "
+            "`torch.use_deterministic_algorithms(True, warn_only=False)` and "
+            "environment variable `CUBLAS_WORKSPACE_CONFIG` to `:4096:8`.\n"
+            "Please note that this may impact performance, or cause crashes if the model includes non-deterministic operations."
+        )
+    )
+
+
+AO_USE_DETERMINISTIC_ALGORITHMS = (
+    os.environ.get("AO_USE_DETERMINISTIC_ALGORITHMS", "0") == "1"
+)
+if AO_USE_DETERMINISTIC_ALGORITHMS:
+    _use_deterministic()


@ar_utils.dump_elapsed_time()
def run_evaluation(model, tokenizer, tasks, compile=False, batch_size=4):
@@ -62,7 +86,9 @@ def main(args):
    )
    model.eval()
    model_device = args.model_device
-    ar_utils.gen_text(model, tokenizer, "Float model", max_length=50)
+    # `sorted_logits` does not have a deterministic implementation
+    if not AO_USE_DETERMINISTIC_ALGORITHMS:
+        ar_utils.gen_text(model, tokenizer, "Float model", max_length=50)
    model = model.to(model_device)
    model.config.use_cache = False
    msg = "Float-model" if args.eval_float_model else "Quantized-model"
@@ -121,12 +147,15 @@ def main(args):
        bs=args.train_bs,
        nsamples=args.nsamples,
        use_optimized_layer_output=args.use_optimized_layer_output,
+        gradient_accumulate_steps=args.gradient_accumulate_steps,
+        compile_optimization_process=args.compile_optimization_process,
    )
    quantized_layer_cnt = ar_utils.count_tensor_of_type(
        model, torchao.dtypes.AffineQuantizedTensor
    )
    msg += f" quantized {quantized_layer_cnt} Linear layers "
-    ar_utils.gen_text(model, tokenizer, msg, max_length=50)
+    if not AO_USE_DETERMINISTIC_ALGORITHMS:
+        ar_utils.gen_text(model, tokenizer, msg, max_length=50)

    bench_accuracy(model, tokenizer, tasks=args.tasks, msg=msg)

@@ -172,6 +201,12 @@ def main(args):
        type=int,
        help="Sequence length for calibration process",
    )
+    parser.add_argument(
+        "--gradient_accumulate_steps",
+        default=1,
+        type=int,
+        help="Number of gradient accumulation steps",
+    )
    parser.add_argument(
        "--quant_lm_head",
        default=False,
@@ -184,6 +219,13 @@ def main(args):
        action="store_true",
        help="Use the optimized layer output for next layer or not",
    )
+    parser.add_argument(
+        "-c",
+        "--compile_optimization_process",
+        default=False,
+        action="store_true",
+        help="Whether to compile the optimization process",
+    )
    parser.add_argument(
        "-d",
        "--model_device",
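`AO_USE_DETERMINISTIC_ALGORITHMS` is read at import time, so it must be in the environment before the script starts. A hedged reproduction sketch follows; the `--train_bs` flag is referenced via `args.train_bs` above but defined outside these hunks, and the values mirror the starred README configuration.

```python
import os
import subprocess

# Launch eval_autoround.py with determinism enabled. The variable must be
# set before the script imports torch, so pass it to the child process
# rather than setting it inside the script.
env = dict(os.environ, AO_USE_DETERMINISTIC_ALGORITHMS="1")
subprocess.run(
    [
        "python", "eval_autoround.py",
        "--train_bs", "4",
        "--gradient_accumulate_steps", "2",
        "-c",  # --compile_optimization_process
    ],
    env=env,
    check=True,
)
```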
16 changes: 13 additions & 3 deletions torchao/prototype/autoround/utils.py
@@ -6,6 +6,7 @@

import numpy as np
import torch
+import collections


def _is_package_available(pkg_name, metadata_name=None):
@@ -149,8 +150,8 @@ def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
    )
    return model, tokenizer, decoder_cls

-
-def dump_elapsed_time(customized_msg=""):
+execution_records = collections.defaultdict(list)
+def dump_elapsed_time(customized_msg="", record=False):
    """Get the elapsed time for decorated functions.

    Args:
@@ -164,13 +165,22 @@ def fi(*args, **kwargs):
            start = time.time()
            res = func(*args, **kwargs)
            end = time.time()
+            dur = round((end - start) * 1000, 2)
+            if record:
+                execution_records[func.__qualname__].append(dur)
            logging.warning(
                "%s elapsed time: %s ms"
                % (
                    customized_msg if customized_msg else func.__qualname__,
-                    round((end - start) * 1000, 2),
+                    dur,
                )
            )
+            if record:
+                avg_time = sum(execution_records[func.__qualname__]) / len(execution_records[func.__qualname__])
+                std_time = np.std(execution_records[func.__qualname__])
+                logging.warning(
+                    f"For {func.__qualname__}, the average elapsed time: {avg_time: .2f} ms, the std: {std_time: .2f} ms"
+                )
            return res

        return fi
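
A small usage sketch of the updated decorator, assuming a torchao checkout with this PR applied; the sleep is a stand-in for real per-block work:

```python
import time

from torchao.prototype.autoround.utils import dump_elapsed_time

@dump_elapsed_time(record=True)
def calibrate_block():
    time.sleep(0.05)  # stand-in for one block's optimization work

for _ in range(3):
    calibrate_block()
# Each call logs "<qualname> elapsed time: ... ms"; with record=True the
# decorator additionally logs the running average and standard deviation
# across all recorded calls.
```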