Integrate AutoRound (#1619)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Kaihui-intel authored Feb 25, 2024
1 parent 354791d commit 5c7f336
Showing 9 changed files with 384 additions and 3 deletions.
5 changes: 5 additions & 0 deletions .azure-pipelines/scripts/ut/env_setup.sh
@@ -97,6 +97,11 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then
# so test distribute cases in the env with single fw installed
pip install horovod
fi

if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then
pip install git+https://github.com/intel/auto-round.git@a868c805de4be271cfe7403309a64d9bf03a0ecf
fi

# test deps
pip install coverage
pip install pytest
90 changes: 90 additions & 0 deletions neural_compressor/adaptor/pytorch.py
@@ -4615,6 +4615,9 @@ def quantize(self, tune_cfg, model, dataloader, calib_func=None):
q_model._model = self.awq_quantize(q_model._model, tune_cfg, dataloader, calib_func)
if "RTN" in all_algo:
q_model._model = self.rtn_quantize(q_model._model, tune_cfg)
if "AUTOROUND" in all_algo:
q_model._model, autoround_config = self.autoround_quantize(q_model._model, tune_cfg, dataloader)
q_model.autoround_config = autoround_config

q_model.q_config = copy.deepcopy(self.tune_cfg)
q_model.is_quantized = True
@@ -4911,6 +4914,93 @@ def awq_quantize(self, model, tune_cfg, dataloader, calib_func):
)
return model

def autoround_quantize(self, model, tune_cfg, dataloader):
logger.info("quantizing with the AutoRound algorithm")
from .torch_utils.weight_only import autoround_quantize

# build weight_config
"""
weight_config={
'layer1':##layer_name
{
'data_type': 'int',
'bits': 4,
'group_size': 32,
'scheme': "asym", ## or sym
}
...
}
"""
weight_config = {}
for key, config in tune_cfg["op"].items():
if config["weight"]["dtype"] == "fp32":
continue
op_name, op_type = key
weight_config[op_name] = {}
weight_config[op_name]["data_type"] = config["weight"]["dtype"]
weight_config[op_name]["bits"] = config["weight"]["bits"]
weight_config[op_name]["group_size"] = config["weight"]["group_size"]
weight_config[op_name]["scheme"] = config["weight"]["scheme"]

# auto round recipes
enable_full_range = self.recipes["autoround_args"].get("enable_full_range", False)
bs = self.recipes["autoround_args"].get("bs", 8)
amp = self.recipes["autoround_args"].get("amp", True)
device = self.recipes["autoround_args"].get("device", "cpu")
lr_scheduler = self.recipes["autoround_args"].get("lr_scheduler", None)
dataset_name = self.recipes["autoround_args"].get("dataset_name", "NeelNanda/pile-10k")
dataset_split = self.recipes["autoround_args"].get("dataset_split", "train")
use_quant_input = self.recipes["autoround_args"].get("use_quant_input", True)
enable_minmax_tuning = self.recipes["autoround_args"].get("enable_minmax_tuning", True)
lr = self.recipes["autoround_args"].get("lr", None)
minmax_lr = self.recipes["autoround_args"].get("minmax_lr", None)
low_gpu_mem_usage = self.recipes["autoround_args"].get("low_gpu_mem_usage", True)
iters = self.recipes["autoround_args"].get("iters", 200)
seqlen = self.recipes["autoround_args"].get("seqlen", 2048)
n_samples = self.recipes["autoround_args"].get("n_samples", 512)
sampler = self.recipes["autoround_args"].get("sampler", "rand")
seed = self.recipes["autoround_args"].get("seed", 42)
n_blocks = self.recipes["autoround_args"].get("n_blocks", 1)
gradient_accumulate_steps = self.recipes["autoround_args"].get("gradient_accumulate_steps", 1)
not_use_best_mse = self.recipes["autoround_args"].get("not_use_best_mse", False)
dynamic_max_gap = self.recipes["autoround_args"].get("dynamic_max_gap", -1)
data_type = self.recipes["autoround_args"].get("data_type", "int")  ## only the "int" data type is supported
scale_dtype = self.recipes["autoround_args"].get("scale_dtype", "fp16")

model, autoround_config = autoround_quantize(
model=model,
tokenizer=None,
bits=4,
group_size=128,
scheme="asym",
weight_config=weight_config,
enable_full_range=enable_full_range,
bs=bs,
amp=amp,
device=device,
lr_scheduler=lr_scheduler,
dataloader=dataloader,
dataset_name=dataset_name,
dataset_split=dataset_split,
use_quant_input=use_quant_input,
enable_minmax_tuning=enable_minmax_tuning,
lr=lr,
minmax_lr=minmax_lr,
low_gpu_mem_usage=low_gpu_mem_usage,
iters=iters,
seqlen=seqlen,
n_samples=n_samples,
sampler=sampler,
seed=seed,
n_blocks=n_blocks,
gradient_accumulate_steps=gradient_accumulate_steps,
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
data_type=data_type,
scale_dtype=scale_dtype,
)
return model, autoround_config

def _dump_model_op_stats(self, model, tune_cfg):
"""This is a function to dump quantizable ops of model to user.
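
For reference, a minimal self-contained sketch of the weight_config translation performed by the adaptor's autoround_quantize() above; the sample tune_cfg is invented for illustration and only mirrors the keys the loop reads.

# Hypothetical tune_cfg["op"] entries: keys are (op_name, op_type) tuples, values carry the weight settings.
sample_tune_cfg = {
    "op": {
        ("transformer.h.0.attn.k_proj", "Linear"): {
            "weight": {"dtype": "int", "bits": 4, "group_size": 32, "scheme": "asym"}
        },
        ("lm_head", "Linear"): {
            "weight": {"dtype": "fp32"}
        },
    }
}

weight_config = {}
for (op_name, op_type), config in sample_tune_cfg["op"].items():
    if config["weight"]["dtype"] == "fp32":
        continue  # fp32 ops stay unquantized and get no entry
    weight_config[op_name] = {
        "data_type": config["weight"]["dtype"],
        "bits": config["weight"]["bits"],
        "group_size": config["weight"]["group_size"],
        "scheme": config["weight"]["scheme"],
    }

print(weight_config)
# {'transformer.h.0.attn.k_proj': {'data_type': 'int', 'bits': 4, 'group_size': 32, 'scheme': 'asym'}}
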
2 changes: 1 addition & 1 deletion neural_compressor/adaptor/pytorch_cpu.yaml
@@ -267,7 +267,7 @@
# group_size=-1 means per-channel, others means per-group
'group_size': [32, -1, 1, 4, 8, 16, 64, 128, 256, 512, 1024], # [1-inf], # 32
'scheme': ['sym', 'asym'], # sym, no ZP
'algorithm': ['RTN', 'AWQ', 'GPTQ', 'TEQ'], # RTN, [RTN, GPTQ, AWQ,] RTN+AWQ+TEQ order
'algorithm': ['RTN', 'AWQ', 'GPTQ', 'TEQ', 'AUTOROUND'], # RTN, [RTN, GPTQ, AWQ,] RTN+AWQ+TEQ order
},
'activation': {
'dtype': ['fp32'],
25 changes: 25 additions & 0 deletions neural_compressor/adaptor/torch_utils/auto_round.py
@@ -0,0 +1,25 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from auto_round.calib_dataset import CALIB_DATASETS # pylint: disable=E0401


def get_dataloader(
tokenizer, seqlen=2048, seed=42, train_bs=8, dataset_split="train", dataset_name="NeelNanda/pile-10k"
):
get_dataloader = CALIB_DATASETS.get(dataset_name, CALIB_DATASETS["NeelNanda/pile-10k"])
dataloader = get_dataloader(
tokenizer, seqlen=seqlen, seed=seed, bs=train_bs, split=dataset_split, dataset_name=dataset_name
)
return dataloader
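
A usage sketch of the helper above, mirroring the unit test added in this commit; it assumes the pinned auto-round package is installed and the dataset/tokenizer can be downloaded.

import transformers

from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)
# Short sequences keep the calibration data tiny; the default is seqlen=2048.
dataloader = get_dataloader(
    tokenizer, seqlen=10, seed=42, train_bs=8, dataset_split="train", dataset_name="NeelNanda/pile-10k"
)
for batch in dataloader:
    # Inspect one calibration batch; its exact layout is defined by auto_round's CALIB_DATASETS.
    print(type(batch))
    break
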
118 changes: 118 additions & 0 deletions neural_compressor/adaptor/torch_utils/weight_only.py
@@ -670,3 +670,121 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1):
int_weight_tmp.add_(zp[:, -1].unsqueeze(1))
int_weight[:, leng * group_size :].copy_(int_weight_tmp.round_())
return int_weight


def autoround_quantize(
model,
tokenizer,
bits: int = 4,
group_size: int = 128,
scheme: str = "asym",
weight_config: dict = {},
enable_full_range: bool = False, ##for symmetric, TODO support later
bs: int = 8,
amp: bool = True,
device="cuda:0",
lr_scheduler=None,
dataloader=None, ## to support later
dataset_name: str = "NeelNanda/pile-10k",
dataset_split: str = "train",
use_quant_input: bool = True,
enable_minmax_tuning: bool = True,
lr: float = None,
minmax_lr: float = None,
low_gpu_mem_usage: bool = True,
iters: int = 200,
seqlen: int = 2048,
n_samples: int = 512,
sampler: str = "rand",
seed: int = 42,
n_blocks: int = 1,
gradient_accumulate_steps: int = 1,
not_use_best_mse: bool = False,
dynamic_max_gap: int = -1,
data_type: str = "int",  ## only the "int" data type is supported
scale_dtype="fp16",
**kwargs,
):
"""Run autoround weight-only quantization.
Args:
model: The PyTorch model to be quantized.
tokenizer: Tokenizer for processing input data. Temporarily set as a mandatory parameter.
bits (int): Number of bits for quantization (default is 4).
group_size (int): Size of the quantization group (default is 128).
scheme (str): The quantization scheme to be used (default is "asym").
weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
weight_config={
'layer1':##layer_name
{
'data_type': 'int',
'bits': 4,
'group_size': 32,
'scheme': "asym", ## or sym
}
...
}
enable_full_range (bool): Whether to enable full range quantization (default is False).
bs (int): Batch size for training (default is 8).
amp (bool): Whether to use automatic mixed precision (default is True).
device: The device to be used for tuning (default is "cuda:0").
lr_scheduler: The learning rate scheduler to be used.
dataloader: The dataloader for input data (to be supported in future).
dataset_name (str): The default dataset name (default is "NeelNanda/pile-10k").
dataset_split (str): The split of the dataset to be used (default is "train").
use_quant_input (bool): Whether to use quantized input data (default is True).
enable_minmax_tuning (bool): Whether to enable min-max tuning (default is True).
lr (float): The learning rate (default is None).
minmax_lr (float): The learning rate for min-max tuning (default is None).
low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True).
iters (int): Number of iterations (default is 200).
seqlen (int): Length of the calibration sequences (default is 2048).
n_samples (int): Number of samples (default is 512).
sampler (str): The sampling method (default is "rand").
seed (int): The random seed (default is 42).
n_blocks (int): Number of blocks (default is 1).
gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1).
not_use_best_mse (bool): Whether to skip using the parameters with the best mean squared error (default is False).
dynamic_max_gap (int): The dynamic maximum gap (default is -1).
data_type (str): The data type to be used (default is "int").
scale_dtype (str): The data type of the quantization scales (default is "fp16").
**kwargs: Additional keyword arguments.
Returns:
A tuple of the quantized (QDQ) model and the per-layer quantization configuration.
"""
from auto_round import AutoRound # pylint: disable=E0401

rounder = AutoRound(
model=model,
tokenizer=tokenizer,
bits=bits,
group_size=group_size,
scheme=scheme,
weight_config=weight_config,
enable_full_range=enable_full_range, ##for symmetric, TODO support later
bs=bs,
amp=amp,
device=device,
lr_scheduler=lr_scheduler,
dataloader=dataloader, ## to support later
dataset_name=dataset_name,
dataset_split=dataset_split,
use_quant_input=use_quant_input,
enable_minmax_tuning=enable_minmax_tuning,
lr=lr,
minmax_lr=minmax_lr,
low_gpu_mem_usage=low_gpu_mem_usage,
iters=iters,
seqlen=seqlen,
n_samples=n_samples,
sampler=sampler,
seed=seed,
n_blocks=n_blocks,
gradient_accumulate_steps=gradient_accumulate_steps,
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
data_type=data_type,  ## only the "int" data type is supported
scale_dtype=scale_dtype,
**kwargs,
)
qdq_model, weight_config = rounder.quantize()
return qdq_model, weight_config
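
For reference, a hedged sketch of calling autoround_quantize() directly, mirroring how the PyTorch adaptor invokes it (tokenizer=None, calibration dataloader supplied). The model name and the small iters/n_samples/seqlen values are borrowed from the unit test in this commit and are illustrative only; the sketch assumes the pinned auto-round package is installed.

import transformers

from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader
from neural_compressor.adaptor.torch_utils.weight_only import autoround_quantize

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
dataloader = get_dataloader(tokenizer, seqlen=10, train_bs=8)

# 4-bit asymmetric weight-only quantization on CPU with a deliberately tiny tuning budget.
qdq_model, autoround_config = autoround_quantize(
    model=model,
    tokenizer=None,  # the adaptor path passes a dataloader instead of a tokenizer
    bits=4,
    group_size=32,
    scheme="asym",
    dataloader=dataloader,
    device="cpu",
    amp=False,
    iters=10,
    n_samples=20,
    seqlen=10,
    scale_dtype="fp32",
)
print(list(autoround_config.keys())[:3])  # per-layer quantization details returned by AutoRound
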
9 changes: 8 additions & 1 deletion neural_compressor/config.py
@@ -60,7 +60,7 @@
),
Optional("algorithm"): And(
list, # TODO: allow AWQ+GPTQ algo
lambda s: all(i in ["minmax", "RTN", "AWQ", "GPTQ", "TEQ"] for i in s),
lambda s: all(i in ["minmax", "RTN", "AWQ", "GPTQ", "TEQ", "AUTOROUND"] for i in s),
),
Optional("bits"): And(list, lambda s: all(0 < i <= 8 and type(i) == int for i in s)),
Optional("group_size"): And(list, lambda s: all(i >= -1 and i != 0 and type(i) == int for i in s)),
@@ -941,6 +941,12 @@ def teq_args(val=None):
else:
return {}

def autoround_args(val=None):
if val is not None:
return _check_value("autoround_args", val, dict)
else:
return {}

def fast_bias_correction(val=None):
if val is not None:
return _check_value("fast_bias_correction", val, bool)
@@ -1025,6 +1031,7 @@ def dedicated_qdq_pair(val=None):
"awq_args": awq_args,
"gptq_args": gptq_args,
"teq_args": teq_args,
"autoround_args": autoround_args,
}
self._recipes = {}
for k in RECIPES.keys():
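
For reference, a sketch of how a user selects the new AUTOROUND algorithm and passes autoround_args through the recipe registered above; the keys and values mirror the unit test added below.

from neural_compressor import PostTrainingQuantConfig

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # re.match
            "weight": {
                "dtype": "int",
                "bits": 4,
                "group_size": 32,
                "scheme": "sym",
                "algorithm": "AUTOROUND",
            },
        },
    },
    recipes={
        "autoround_args": {
            "n_samples": 20,   # number of calibration samples
            "amp": False,
            "seqlen": 10,      # recipe key read by the PyTorch adaptor
            "iters": 10,
            "scale_dtype": "fp32",
            "device": "cpu",
        },
    },
)
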
63 changes: 63 additions & 0 deletions test/adaptor/pytorch_adaptor/test_weight_only_adaptor.py
@@ -12,6 +12,13 @@
from neural_compressor.utils.load_huggingface import export_compressed_model
from neural_compressor.utils.pytorch import load

try:
import auto_round

auto_round_installed = True
except ImportError:
auto_round_installed = False


class Model(torch.nn.Module):
def __init__(self):
@@ -738,6 +745,62 @@ def __iter__(self):
out2 = q_model.model(input)
self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))

@unittest.skipIf(not auto_round_installed, "auto_round module is not installed")
def test_AutoRound_quant(self):
from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader

tokenizer = transformers.AutoTokenizer.from_pretrained(
"hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)
dataloader = get_dataloader(
tokenizer, seqlen=10, seed=42, train_bs=8, dataset_split="train", dataset_name="NeelNanda/pile-10k"
)
fp32_model = copy.deepcopy(self.gptj)

conf = PostTrainingQuantConfig(
approach="weight_only",
op_type_dict={
".*": { # re.match
"weight": {
"dtype": "int",
"bits": 4,
"group_size": 32, # -1 (per-channel)
"scheme": "sym",
"algorithm": "AUTOROUND",
},
},
},
op_name_dict={
".*lm_head": { # re.match
"weight": {"dtype": "fp32"},
},
},
recipes={
"autoround_args": {
"n_samples": 20,
"amp": False,
"seq_len": 10,
"iters": 10,
"scale_dtype": "fp32",
"device": "cpu",
},
},
)

input = torch.ones([1, 512], dtype=torch.long)
fp32_model = copy.deepcopy(self.gptj)
out1 = fp32_model(input)
q_model = quantization.fit(
fp32_model,
conf,
calib_dataloader=dataloader,
)
out2 = q_model.model(input)
self.assertTrue(torch.allclose(out1[0], out2[0], atol=1e-01))
self.assertTrue("transformer.h.0.attn.k_proj" in q_model.autoround_config.keys())
self.assertTrue("scale" in q_model.autoround_config["transformer.h.0.attn.k_proj"].keys())
self.assertTrue(torch.float32 == q_model.autoround_config["transformer.h.0.attn.k_proj"]["scale_dtype"])


if __name__ == "__main__":
unittest.main()