[int4-quant] Execute weights shuffling on CPU until MPS memory issue is resolved (pytorch#552)
dbyoung18 · Jul 29, 2024 · 4abe4b8 · 4abe4b8
1 parent 4563492
commit 4abe4b8
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
@@ -351,7 +351,13 @@ def groupwise_affine_quantize_tensor_from_qparams(
 
     int_data = quantize_affine(w, block_size, scales, zeros, output_dtype, quant_min, quant_max, zero_point_domain = ZeroPointDomain.FLOAT)
     if TORCH_VERSION_AFTER_2_5:
+        int_data_device_type = int_data.device.type
+        # Move to cpu, until issue with MPS memory management of temporary tensors is resolved
+        if int_data_device_type == 'mps':
+            int_data = int_data.cpu()
         int_data = (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8)
+        if int_data_device_type == 'mps':
+            int_data = int_data.to(device='mps')
     return int_data
 
 def groupwise_affine_dequantize_tensor_from_qparams(