diff --git a/benchmarks/benchmark_hqq.py b/benchmarks/benchmark_hqq.py new file mode 100644 index 0000000000..393481e95b --- /dev/null +++ b/benchmarks/benchmark_hqq.py @@ -0,0 +1,147 @@ + +try: + import triton + import hqq + if int(triton.__version__.split(".")[0]) < 3: + raise "triton >= 3.0.0 is required to run this test" +except ImportError: + raise "triton and hqq required to run this benchmark" + +import torch +from io import StringIO + +import pandas as pd +from hqq.core.quantize import HQQLinear, BaseQuantizeConfig +from torchao.prototype.hqq.hqq_tinygemm_linear import HQQLinearTorchWeightOnlyInt4 +from torchao.prototype.hqq import triton_mixed_mm, pack_2xint4 + +from triton.testing import do_bench + + +BASE_QUANT_CONFIG = { + "optimize": True, + "view_as_float": False, + "nbits": 4, + "bitpack": False, + "axis": 1, +} + + +def bench_custom_kernel(x, W_q, scales, zeros, group_size, kernel_type="max_autotune", fp8_fast_accum=False): + packed_w = pack_2xint4(W_q.T) + + def fn(): + _ = triton_mixed_mm( + x, + packed_w, + scales.T, + zeros.T, + group_size=group_size, + fp8_fast_accum=fp8_fast_accum, + kernel_type=kernel_type, + ) + + t = do_bench(fn) + return t + + +def bench_hqq(x, hqq_linear: HQQLinear): + def fn(): + _ = hqq_linear.forward(x) + + t = do_bench(fn) + return t + + +def run_benchmark(shape, group_size, dtype, axis=1, quant_dtype=torch.uint8): + qcfg = { + **BASE_QUANT_CONFIG, + **dict(group_size=group_size, axis=axis), + } + M, N, K = shape + + x = torch.randn(M, K, dtype=dtype, device="cuda") + linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device="cuda") + + quant_config = BaseQuantizeConfig( + quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False + ) + quant_config.update({"weight_quant_params": qcfg}) + + hqq_linear = HQQLinear(linear, quant_config, compute_dtype=dtype, del_orig=False) + + # Reference + ref_time = bench_hqq(x, hqq_linear) + + # Custom kernel + W_q, meta = hqq_linear.W_q, hqq_linear.meta + scales, zeros = meta["scale"], meta["zero"] + + W_q = ( + W_q.reshape(meta["shape"]) + if quant_config["weight_quant_params"]["bitpack"] == False + else W_q + ) + W_q = W_q.to(dtype=quant_dtype) + scales = scales.reshape(N, -1) + zeros = zeros.reshape(N, -1) + tt_time = bench_custom_kernel(x, W_q, scales, zeros, group_size) + + if dtype == torch.bfloat16: + _ = quant_config["weight_quant_params"].pop("bitpack") + hqq_int4mm = HQQLinearTorchWeightOnlyInt4( + linear, quant_config, compute_dtype=dtype, del_orig=False + ) + int4_time = bench_hqq(x, hqq_int4mm) + + print(f"{shape=} {group_size=} {dtype=}:") + + print( + f"Ref: {ref_time:.4f}", + f"Triton: {tt_time:.4f}", + f"Torch int4mm: {int4_time:.4f}" + if dtype == torch.bfloat16 + else "", + ) + print() + return ref_time, tt_time, int4_time if dtype == torch.bfloat16 else None + + +SHAPES = [ + [16, 4096, 4096], + [32, 4096, 4096], + [128, 4096, 4096], + [256, 4096, 4096], + [512, 4096, 4096], + [1024, 4096, 4096], +] + +DTYPES = [torch.bfloat16] # , torch.float16] +GROUP_SIZES = [128] + + +HEADERS = [ + "M", + "N", + "K", + "group_size", + "dtype", + "ref", + "triton", + "tinygemm", +] +data = [] + +if __name__ == "__main__": + print(torch.cuda.get_device_properties(0)) + + for shape in SHAPES: + for group_size in GROUP_SIZES: + for dtype in DTYPES: + timings = run_benchmark(shape, group_size, dtype) + data.append((*shape, group_size, dtype, *timings)) + + output = StringIO() + df = pd.DataFrame(data, columns=HEADERS) + df.to_csv(output, index=False) + print(output.getvalue()) \ No newline at end of file diff --git a/test/hqq/test_triton_mm.py b/test/hqq/test_triton_mm.py new file mode 100644 index 0000000000..23f6c60f70 --- /dev/null +++ b/test/hqq/test_triton_mm.py @@ -0,0 +1,104 @@ +# Skip entire test if triton is not available, otherwise CI failure +import pytest +try: + import triton + import hqq + if int(triton.__version__.split(".")[0]) < 3: + pytest.skip("triton >= 3.0.0 is required to run this test", allow_module_level=True) +except ImportError: + pytest.skip("triton and hqq required to run this test", allow_module_level=True) + +import itertools +import torch + +from hqq.core.quantize import HQQLinear, BaseQuantizeConfig +from torchao.prototype.hqq import triton_mixed_mm, pack_2xint4 + + +#Test configs +SHAPES = [ + [16, 128, 128], + [16, 4096, 4096], +] + +DTYPES = [torch.bfloat16, torch.float16] +GROUP_SIZES = [64, 128] +AXES = [1] #Only axis = 1 supported +TRANSPOSED = [True] +TRITON_KERNEL_TYPE = ["compute_bound"] #["max_autotune", "compute_bound"] + +TEST_CONFIGS = list(itertools.product(SHAPES, GROUP_SIZES, AXES, DTYPES, TRANSPOSED, TRITON_KERNEL_TYPE)) + +BASE_QUANT_CONFIG = { + "optimize": True, + "view_as_float": False, + "nbits": 4, + "bitpack": False, + "axis": 1, +} + + +def check(expected, actual, msg="", max_diff=1e-3, verbose=False): + passed = torch.allclose(expected, actual, atol=max_diff, rtol=max_diff) + if verbose: + max_err = (expected - actual).abs().max() + if not passed: + print(f"{msg}: Failed! Max error: {max_err}") + else: + print(f"{msg}: Passed! Max error: {max_err}") + + return passed + +def _arg_to_id(arg): + if isinstance(arg, list): + return "x".join([str(x) for x in arg]) + return str(arg) + +@pytest.mark.parametrize("shape, group_size, axis, dtype, transposed, kernel_type", TEST_CONFIGS, ids=_arg_to_id) +def test_mixed_mm(shape, group_size, axis, dtype, transposed, kernel_type, quant_dtype=torch.uint8): + qcfg = { + **BASE_QUANT_CONFIG, + **dict(group_size=group_size, axis=axis), + } + M, N, K = shape + + linear = torch.nn.Linear(K, N, bias=False, dtype=dtype, device="cuda") + + quant_config = BaseQuantizeConfig( + quant_zero=False, quant_scale=False, offload_meta=False, view_as_float=False + ) + quant_config.update({"weight_quant_params": qcfg}) + hqq_linear = HQQLinear(linear, quant_config, compute_dtype=dtype, del_orig=False) + W_q, meta = hqq_linear.W_q, hqq_linear.meta + W_q = W_q.to(dtype=quant_dtype) + W_q = ( + W_q.reshape(meta["shape"]) + if quant_config["weight_quant_params"]["bitpack"] == False + else W_q + ) + W_dq = hqq_linear.dequantize() + + scales, zeros = meta["scale"], meta["zero"] + scales = scales.reshape(N, -1) + zeros = zeros.reshape(N, -1) + + if transposed: + x = torch.randn(M, N, dtype=dtype, device="cuda") + hqq_out = x @ W_dq + + #Pack uint8 W_q, then run fused dequant matmul + packed_w = pack_2xint4(W_q) + tt_out = triton_mixed_mm( + x, packed_w, scales, zeros, transposed=True, group_size=group_size, fp8_fast_accum=False, kernel_type=kernel_type + ) + else: + x = torch.randn(M, K, dtype=dtype, device="cuda") + hqq_out = x @ W_dq.T + + packed_w = pack_2xint4(W_q.T) + tt_out = triton_mixed_mm( + x, packed_w, scales.T, zeros.T, transposed=False, group_size=group_size, fp8_fast_accum=False, kernel_type=kernel_type + ) + + assert check(hqq_out, tt_out, max_diff=1e-2 if dtype == torch.bfloat16 else 1e-3) + diff --git a/torchao/prototype/hqq/README.md b/torchao/prototype/hqq/README.md new file mode 100644 index 0000000000..22c40fd246 --- /dev/null +++ b/torchao/prototype/hqq/README.md @@ -0,0 +1,55 @@ +## Fused `int4 / fp16` Quant Matmul + +Fused kernel that combines asymmetric dequantization and gemm. Useful primarily for compute-bound (M > 16) scenarios and not for memory-bound / inference scenarios. + +The kernel fuses two ops: + +- Dequantization: upcasts `u4 / s4` weights to `float16 / bfloat16`, followed by groupwise scaling and shifting by scales / zeropoints +- GEMM: matmul on dequantized weights and activations. + +Tested and benchmarked for `HQQ` but could theoretically be used for any asymmetric quantization scheme. + +> **NOTE**: Benchmark below is only indicative of performance on consumer-grade `Ampere` GPUs (`A6000` specifically). When tested on `H100`, the performance is on par / marginally worse than native / compiled `torch`. +> The intended use is thus for fine-tuning / training models on non-datacenter GPUs (`80 <= compute capability < 90`). If interested in optimizing the kernel for other architectures, please drop a note in the CUDA-MODE Discord channel. + +### Implementation Details + +- Bitpacking is simple row interleave, no need for extensive preprocessing (e.g., `tinygemm` or `fastertransformer`) +- Tested for `float16 / bfloat16` activations, scales, and zeros +- Autotuned for both compute-bound and memory-bound configs +- Assumes operand B of the `gemm` is is the quantized type. +- Requires quantization along `in-features`, i.e., the `K` dimension, or `axis=1`, of `torch.linear.weight`. +- Implementation handles both transposed and non-tranposed quantized weights, useful for forward / backward training passes. + +### Performance + +Initial benchmarking (on `A6000`) demonstrates promising results, scaling well for compute-bound workloads: + +| | M | N | K | group_size | dtype | hqq_ref | triton | tinygemm | +| --- | ---- | ---- | ---- | ---------- | -------------- | ------- | ------ | -------- | +| 0 | 16 | 4096 | 4096 | 128 | torch.bfloat16 | 0.2675 | 0.0633 | 0.0382 | +| 1 | 32 | 4096 | 4096 | 128 | torch.bfloat16 | 0.2669 | 0.0704 | 0.0649 | +| 2 | 128 | 4096 | 4096 | 128 | torch.bfloat16 | 0.2689 | 0.0960 | 0.2523 | +| 3 | 256 | 4096 | 4096 | 128 | torch.bfloat16 | 0.3268 | 0.1355 | 0.5192 | +| 4 | 512 | 4096 | 4096 | 128 | torch.bfloat16 | 0.3628 | 0.2369 | 1.0892 | +| 5 | 1024 | 4096 | 4096 | 128 | torch.bfloat16 | 0.5133 | 0.4753 | 2.2016 | + +- Times are in `ms`, see `benchmarks/benchmark_hqq.py`. +- `hqq_ref` is the base `HQQ_Linear` [module](https://github.com/mobiusml/hqq/blob/6d50eee4bcdd99cc10716f1297c5b2803d2b6da4/hqq/core/quantize.py#L349) that is unfused (dequantization followed by call to torch.matmul). +- `tinygemm` calls `torch.ops.aten._weight_int4pack_mm`. Implementation is a custom HQQLinear layer that wraps the preprocessing necessary for this kernel, adapted from a benchmark script posted by @mobicham from `CUDA-mode` Discord discussions. + +GPU details: + +``` +_CudaDeviceProperties(name='NVIDIA RTX A6000', major=8, minor=6, total_memory=48676MB, multi_processor_count=84) +``` + +### NOTE + +This implementation requires **`triton >= 3.0.0`**. + +- Running tests / benchmarks requires installation of `hqq`: + + ``` + pip install hqq + ``` diff --git a/torchao/prototype/hqq/__init__.py b/torchao/prototype/hqq/__init__.py new file mode 100644 index 0000000000..c97591c475 --- /dev/null +++ b/torchao/prototype/hqq/__init__.py @@ -0,0 +1 @@ +from .mixed_mm import triton_mixed_mm, pack_2xint4 \ No newline at end of file diff --git a/torchao/prototype/hqq/hqq_tinygemm_linear.py b/torchao/prototype/hqq/hqq_tinygemm_linear.py new file mode 100644 index 0000000000..0c4ae45c61 --- /dev/null +++ b/torchao/prototype/hqq/hqq_tinygemm_linear.py @@ -0,0 +1,256 @@ + +#mobicham's tinygemm hqq eval script +import torch + +device = "cuda" + + +import torch, copy +from torch import nn, Tensor + +from hqq.core.quantize import * +from hqq.core.utils import * + +import torch.nn.functional as F + + +class HQQLinearTorchWeightOnlyInt4(torch.nn.Module): + def __init__( + self, + linear_layer: nn.Module | None, + quant_config: dict, + del_orig: bool = True, + compute_dtype: torch.dtype = torch.bfloat16, + device: str = "cuda", + initialize: bool = True, + inner_k_tiles=8, + padding=True, + ): + super().__init__() + + self.ready = False + self.in_gpu = False + self.bias = None + self.device = device + self.compute_dtype = compute_dtype + self.quant_config = copy.deepcopy(quant_config) + self.del_orig = del_orig + + weight_quant_params = self.quant_config["weight_quant_params"] + self.groupsize = weight_quant_params["group_size"] + self.nbits = weight_quant_params["nbits"] + self.inner_k_tiles = inner_k_tiles + self.padding = padding + + assert self.nbits in [1, 2, 4], "Unsupported nbits" + assert self.groupsize in [None, 32, 64, 128, 256], "Unsupported groupsize" + assert self.inner_k_tiles in [2, 4, 8], "Unsupported tile" + + self.linear_layer = linear_layer + self.compute_dtype = compute_dtype + + if initialize: + self.initialize() + + ###################### Initializers ###################### + def initialize_with_hqq_quants(self, W_q, meta, bias=None): + self.padding = ( + False # Force padding off, a bit tricky to post-pad with grouping + ) + + self.set_shape(meta["shape"]) + self.process_hqq_quants(W_q, meta) + self.bias = bias + self.ready = True + self.in_gpu = True + torch.cuda.empty_cache() + + return self + + def initialize(self): + if self.linear_layer is not None: + W = self.linear_layer.weight.data + self.set_shape(W.shape) + + if self.in_features_diff > 0: + W = F.pad(W, pad=(0, self.in_features_diff), value=0) + + W_q, meta = self.quantize(W, **self.quant_config) + self.process_hqq_quants(W_q, meta) + del W_q, meta + + self.bias = ( + None + if (self.linear_layer.bias is None) + else self.linear_layer.bias.to( + dtype=self.compute_dtype, device=self.device + ) + ) + + if self.del_orig: + del self.linear_layer + + self.ready = True + self.in_gpu = True + torch.cuda.empty_cache() + + return self + + ###################### Quantize/packing ###################### + + def quantize( + self, + W: Tensor, + weight_quant_params: dict, + scale_quant_params=dict | None, + zero_quant_params=dict | None, + offload_meta=False, + ): + W_q, meta = Quantizer.quantize( + W, + **weight_quant_params, + device=self.device, + compute_dtype=self.compute_dtype, + bitpack=False, + ) + + # ToDO: meta quantization + + return W_q, meta + + # TODO: move these to utils + @torch.no_grad() + def reshape_meta_axis1(self, meta_tensor, new_group_size, shape): + meta_tensor = meta_tensor.repeat([1, shape[1]]).reshape(shape) + meta_tensor = torch.mean( + meta_tensor.reshape([-1, new_group_size]), axis=1, keepdim=True + ) + return meta_tensor + + def find_multiple(self, n: int, k: int) -> int: + if n % k == 0: + return n + return n + k - (n % k) + + def set_shape(self, shape): + self.shape = shape + self.in_features = shape[1] + self.out_features = shape[0] + + self.origin_in_features = self.in_features + if self.padding: + self.in_features = self.find_multiple(self.in_features, 1024) + + self.in_features_diff = self.in_features - self.origin_in_features + + @torch.no_grad() + def process_hqq_quants(self, W_q, meta): + scales = meta["scale"] + zeros = meta["zero"] + shape = meta["shape"] + + if meta["packing"] is not None: + W_q = Quantizer.unpack[meta["packing"]](W_q) + + if self.groupsize is None: + self.groupsize = 128 + W_q = W_q.reshape([-1, self.groupsize]) + scales = self.reshape_meta_axis1(scales, self.groupsize, shape) + zeros = self.reshape_meta_axis1(zeros, self.groupsize, shape) + + W_q_torch, scales_torch, zeros_torch = self.hqq_quants_to_torch_quants( + W_q=W_q, scales=scales, zeros=zeros, shape=shape, nbits=self.nbits + ) + self.weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + W_q_torch, self.inner_k_tiles + ) + self.scales_and_zeros = self.pack_scales_and_zeros(scales_torch, zeros_torch) + + del W_q_torch, scales_torch, zeros_torch + torch.cuda.empty_cache() + + @torch.no_grad() + def hqq_quants_to_torch_quants( + self, W_q: Tensor, scales: Tensor, zeros: Tensor, shape, nbits=4 + ): + W_q = W_q.to(dtype=self.compute_dtype, device=self.device) + scales = scales.to(dtype=self.compute_dtype, device=self.device) + zeros = zeros.to(dtype=self.compute_dtype, device=self.device) + + max_int = 2**nbits - 1 + min_int = 0 + dump = 2 ** (nbits - 1) + + # HQQ -> torch logic + new_zeros = (scales * dump) - zeros * scales + + min_val = new_zeros - scales * dump + + # group_quantize_tensor_from_qparams + W_r = (W_q - zeros) * scales + + W_q = ( + W_r.sub(min_val) + .div(scales) + .round() + .clamp_(min_int, max_int) + .to(torch.int32) + .reshape(shape) + .contiguous() + ) + + # group_dequantize_tensor_from_qparams + # W_r = W_q*scales + min_val + + scales = scales.contiguous().reshape(shape[0], -1) + new_zeros = new_zeros.contiguous().reshape(shape[0], -1) + + return W_q, scales, new_zeros + + def pack_scales_and_zeros(self, scales, zeros): + assert scales.shape == zeros.shape + assert scales.dtype == torch.bfloat16 + assert zeros.dtype == torch.bfloat16 + return ( + torch.cat( + [ + scales.reshape(scales.size(0), scales.size(1), 1), + zeros.reshape(zeros.size(0), zeros.size(1), 1), + ], + 2, + ) + .transpose(0, 1) + .contiguous() + ) + + ###################### Forward/matmul ###################### + + @torch.jit.ignore() + def matmul(self, x): + origin_x_size = x.size() + x = x.reshape(-1, origin_x_size[-1]) + c = torch.ops.aten._weight_int4pack_mm( + x, self.weight_int4pack, self.groupsize, self.scales_and_zeros + ) + new_shape = origin_x_size[:-1] + (self.out_features,) + c = c.reshape(new_shape) + return c + + # TODO without matmul + def dequantize(self): + return self.matmul( + torch.eye(self.in_features, dtype=self.compute_dtype, device=self.device) + )[: self.origin_in_features].t() + + # TODO: backward + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.to(self.compute_dtype) + + if self.in_features_diff > 0: + x = F.pad(x, pad=(0, self.in_features_diff)) + + out = self.matmul(x) + + if self.bias is not None: + out += self.bias + return out diff --git a/torchao/prototype/hqq/kernels.py b/torchao/prototype/hqq/kernels.py new file mode 100644 index 0000000000..077fc94108 --- /dev/null +++ b/torchao/prototype/hqq/kernels.py @@ -0,0 +1,348 @@ +from triton import Config +import triton.language as tl +import triton + +#TODO: add early config prune and estimate_matmul_time to reduce autotuning time +# from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + Config( + { + "BLOCK_M": block_m, + "BLOCK_N": block_n, + "BLOCK_K": block_k, + "SPLIT_K": 1, + }, + num_stages=num_stages, + num_warps=num_warps, + ) + ) + # split_k + for split_k in [2, 4, 8, 16]: + configs.append( + Config( + { + "BLOCK_M": block_m, + "BLOCK_N": block_n, + "BLOCK_K": block_k, + "SPLIT_K": split_k, + }, + num_stages=num_stages, + num_warps=num_warps, + pre_hook=init_to_zero("C"), + ) + ) + return configs + + +def get_configs_compute_bound(): + configs = [ + # basic configs for compute-bound matmuls + Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, + num_stages=5, + num_warps=2, + ), + # good for int8 + Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, + num_stages=5, + num_warps=2, + ), + ] + return configs + + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +MIXED_MM_HEURISTICS = { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, + "BLOCK_K": lambda args: min(args["BLOCK_K"], args["QGROUP_SIZE"]) if not args["TRANSPOSED"] else args["BLOCK_K"], + "BLOCK_N": lambda args: min(args["BLOCK_N"], args["QGROUP_SIZE"]) if args["TRANSPOSED"] else args["BLOCK_N"], + "SPLIT_K": lambda args: 1 + if args["IS_BFLOAT16"] + else args["SPLIT_K"], # atomic add not supported for bfloat16 +} + + + +@triton.jit +def _mixed_mm_kernel( + # Operands + A, + B, + scales_ptr, + zeros_ptr, + C, + # Matrix dims. + M, + N, + K, + # a, b, c, scales / zeros strides + stride_am, + stride_ak, # + stride_bk, + stride_bn, # + stride_cm, + stride_cn, + stride_scale_k, + stride_scale_n, + # Meta-params + IS_BFLOAT16: tl.constexpr, + QGROUP_SIZE: tl.constexpr, + BLOCK_M: tl.constexpr, # = 32, + BLOCK_N: tl.constexpr, # = 32, + BLOCK_K: tl.constexpr, # = 16, # + SPLIT_K: tl.constexpr, # = 1, + EVEN_K: tl.constexpr, # = True, + TRANSPOSED: tl.constexpr = False, + GROUP_M: tl.constexpr = 8, # 32, + # tl.dot options + acc_dtype: tl.constexpr = tl.float32, + input_precision: tl.constexpr = "ieee", + fp8_fast_accum: tl.constexpr = False, +): + """Mixed matmul kernel + + A has shape (M, K) and is float16, bfloat16, or float32 + + B is i4 / s4 and has shape (K // 2, N) and is packed as uint8 / int8. See `packed_2xint4` for details. + + Scales and zeros are of shape (NUM_GROUPS, N) and are same dtype as A, where NUM_GROUPS = (K // QGROUP_SIZE) + QGROUP_SIZE should be a multiple of BLOCK_K such that a vector of scales / zeros is loaded and broadcasted to block shape + per mainloop iteration. + + NOTE: Assumes that the quantization grouping was done along the K dimension originally (i.e., QGROUP_SIZE consecutive elements + of original weight matrix in the K dimension were grouped together when calculating min / max scaling factors). + """ + + # tl.static_assert(B.dtype.element_ty == tl.int8 or B.dtype.element_ty == tl.uint8) + if not TRANSPOSED: + tl.static_assert(QGROUP_SIZE % BLOCK_K == 0) + else: + tl.static_assert(QGROUP_SIZE % BLOCK_N == 0) + + # Threadblock swizzling + pid = tl.program_id(0) + pid_z = tl.program_id(1) + + grid_m = tl.cdiv(M, BLOCK_M) + grid_n = tl.cdiv(N, BLOCK_N) + + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // group_size + + rm = (pid_m * BLOCK_M + tl.arange(0, BLOCK_M)) % M + rn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N)) % N + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + + rak = pid_z * BLOCK_K + tl.arange(0, BLOCK_K) + + # BLOCK_K for b is effectively BLOCK_K // 2 + rbk = pid_z * BLOCK_K // 2 + tl.arange(0, BLOCK_K // 2) + + A = A + (ram[:, None] * stride_am + rak[None, :] * stride_ak) + B = B + (rbk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + #In the forward pass, we have a K x N matrix + #In the transposed (backward) pass, we have an N x K matrix, where N and K refer to the how the weight was originally quantized + #note that N refers to offsets_scale_k and K refers to offsets_scale_n when it comes to the gemm indexing logic below + + #Grouping is along K, so in the forward pass, each block loads a row vector of BLK_K x BLK_N + #where grouping varies along N, hence the mainloop marches down the K dimension, where + #group idx is given by K // QGROUP_SIZE + + # For the transposed case, we load a column vector of BLK_N x BLK_K + # we march down the N dimension during the mainloop ("K" in gemm) + # Hence blocks now load K // QGROUP_SIZE along pid_n (slow varying) + # while each block now loads column vector of groups along "K" gemm dim on each main loop iteration + # scale offsets is thus a single idx along "N" and range along "K" for the transposed case + + if not TRANSPOSED: + # scale_offset_n = pid_n * stride_scale_n * BLOCK_N + offsets_scale_n = pid_n * stride_scale_n * BLOCK_N + tl.arange(0, BLOCK_N) * stride_scale_n + else: + offsets_scale_n = pid_n * stride_scale_n * BLOCK_N // QGROUP_SIZE + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=acc_dtype) + for k in range(0, tl.cdiv(K, BLOCK_K * SPLIT_K)): + if EVEN_K: + a = tl.load(A) + qb = tl.load(B) + else: + k_remaining_a = K - k * (BLOCK_K * SPLIT_K) + k_remaining_b = K - k * (BLOCK_K * SPLIT_K) // 2 # Note the division by 2 + + _0 = tl.zeros((1, 1), dtype=C.dtype.element_ty) + a = tl.load(A, mask=rak[None, :] < k_remaining_a, other=_0) + qb = tl.load(B, mask=rbk[:, None] < k_remaining_b, other=_0) + + if not TRANSPOSED: + scale_offset_k = k * BLOCK_K * SPLIT_K * stride_scale_k // QGROUP_SIZE + else: + scale_offset_k = k * BLOCK_K * SPLIT_K * stride_scale_k + tl.arange(0, BLOCK_K) * stride_scale_k + + scales = tl.load(scales_ptr + offsets_scale_n + scale_offset_k) + zeros = tl.load(zeros_ptr + offsets_scale_n + scale_offset_k) + + # Unpack qweights -- h/t jlebar! + _4_i8 = tl.full((1,), 4, dtype=tl.int8) + qb_lo = (qb << _4_i8) >> _4_i8 + qb_hi = qb >> _4_i8 + + # Upcast to fp16 + # TODO: better bfloat16 conversion? compilation error if direct conversion from int8 to bfloat16 + if IS_BFLOAT16: + dq_b = ( + tl.join( + qb_lo.to(tl.float16).to(A.dtype.element_ty), + qb_hi.to(tl.float16).to(A.dtype.element_ty), + ) + .permute(0, 2, 1) + .reshape(BLOCK_K, BLOCK_N) + ) + else: + dq_b = ( + tl.join( + qb_lo.to(A.dtype.element_ty), + qb_hi.to(A.dtype.element_ty), + ) + .permute(0, 2, 1) + .reshape(BLOCK_K, BLOCK_N) + ) + + # Scale upcasted weights + # Note that we broadcast the scales --> the assumption is that all scales fall within a single QGROUP + # This condition is statically check (see assertions above) + if not TRANSPOSED: + zeros = zeros[None, :] + scales = scales[None, :] + else: + zeros = zeros[:, None] + scales = scales[:, None] + + dq_b = (dq_b - zeros) * scales + + # dq_b = (dq_b - zeros[None, :]) * scales[None, :] + + if fp8_fast_accum: + acc = tl.dot( + a, dq_b, acc, out_dtype=acc_dtype, input_precision=input_precision + ) + else: + acc += tl.dot(a, dq_b, out_dtype=acc_dtype, input_precision=input_precision) + A += BLOCK_K * SPLIT_K * stride_ak + # Advance by half the block size, since each block is unpacked and upcasted into two fp16 values + B += BLOCK_K * SPLIT_K * stride_bk // 2 + + acc = acc.to(C.dtype.element_ty) + + offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + C = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + if SPLIT_K == 1: + tl.store(C, acc, mask=mask) + else: + tl.atomic_add(C, acc, mask=mask) + + +_mixed_mm = triton.heuristics(MIXED_MM_HEURISTICS)(_mixed_mm_kernel) +mixed_mm_kernel_max_autotune = triton.autotune(configs=get_configs_compute_bound() + get_configs_io_bound(), key=["M", "N", "K"])(_mixed_mm) +mixed_mm_kernel_compute_bound = triton.autotune(configs=get_configs_compute_bound(), key=["M", "N", "K"])(_mixed_mm) diff --git a/torchao/prototype/hqq/mixed_mm.py b/torchao/prototype/hqq/mixed_mm.py new file mode 100644 index 0000000000..e3ccaeb46e --- /dev/null +++ b/torchao/prototype/hqq/mixed_mm.py @@ -0,0 +1,102 @@ +import torch +from triton import cdiv +import triton.language as tl +from .kernels import mixed_mm_kernel_compute_bound, mixed_mm_kernel_max_autotune + +#h/t jlebar for the bit packing / unpacking logic (source: Triton Slack thread) +#https://gist.github.com/jlebar/3435b2c00deea53258887ce37231e5e2 +def pack_2xint4(t): + """ + The packing format is such that consecutive rows are packed into a lower / upper bits + E.g., + Original, unpacked B (dtype i8): + [ + [0, 1, 2, 3] + [4, 5, 6, 7] + [8, 9, 10, 11] + [12, 13, 14, 15] + ] + Packed B: + [ + [0|4, 1|5, 2|6, 3|7] + [8|12, 9|13, 10|14, 11|15] + ] + (Note each entry in `Packed B` is shown lsb->msb) + """ + assert t.dtype == torch.int8 or t.dtype == torch.uint8 + t = t.reshape(t.shape[0] // 2, 2, t.shape[1]).permute(1, 0, 2) + return (t[0] & 0xF) | (t[1] << 4) + +def triton_mixed_mm( + a, + b, + scales, + zeros, + group_size, + transposed=False, + acc_dtype=None, + input_precision="ieee", + fp8_fast_accum=False, + kernel_type="compute_bound", +): + device = a.device + # handle non-contiguous inputs if necessary + if a.stride(0) > 1 and a.stride(1) > 1: + a = a.contiguous() + if b.stride(0) > 1 and b.stride(1) > 1: + b = b.contiguous() + # checks constraints + assert a.shape[1] == b.shape[0] * 2, "incompatible dimensions" + assert b.dtype == torch.int8 or b.dtype == torch.uint8, "b must be int8 or uint8" + assert scales.ndim == 2 + assert kernel_type in ["max_autotune", "compute_bound"] + + M, K = a.shape + _, N = b.shape + # N = b.shape[1] if not transposed else b.shape[0] + # assert scales.shape[1] == N if not transposed else scales.shape[0] == N + # assert scales.shape[0] == K // group_size if not transposed else scales.shape[1] == K // group_size + assert scales.dtype == a.dtype + assert scales.shape == zeros.shape + assert zeros.dtype == a.dtype + + # Assumes c is same type as a + c = torch.empty((M, N), device=device, dtype=a.dtype) + if acc_dtype is None: + acc_dtype = tl.float32 + + grid = lambda META: ( + cdiv(M, META["BLOCK_M"]) * cdiv(N, META["BLOCK_N"]), + META["SPLIT_K"], + ) + + if kernel_type == "max_autotune": + kernel = mixed_mm_kernel_max_autotune + else: + kernel = mixed_mm_kernel_compute_bound + + kernel[grid]( + a, + b, + scales, + zeros, + c, + M, + N, + K, # + a.stride(0), + a.stride(1), # + b.stride(0), + b.stride(1), # + c.stride(0), + c.stride(1), + scales.stride(0), + scales.stride(1), + TRANSPOSED=transposed, + IS_BFLOAT16=a.dtype == torch.bfloat16, + QGROUP_SIZE=group_size, + acc_dtype=acc_dtype, + input_precision=input_precision, + fp8_fast_accum=fp8_fast_accum, + ) + return c