Commit

Merge branch 'main' into docathon

msaroufim authored May 25, 2024
2 parents 97e8d9d + bea1927 commit 5730ed8
Showing 13 changed files with 537 additions and 693 deletions.
16 changes: 15 additions & 1 deletion README.md
@@ -29,9 +29,23 @@ git clone https://github.com/pytorch/ao
cd ao
pip install -r requirements.txt
pip install -r dev-requirements.txt
-pip install .
```

+There are two options:
+If you plan to be developing the library, run:
+```Shell
+python setup.py develop
+```
+
+If you want to install from source, run:
+```Shell
+python setup.py install
+```
+
+**Note:** Since we build PyTorch C++/CUDA extensions by default, running `pip install .` will not work.

### Quantization

```python
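As a quick way to confirm the development install, the sketch below imports the package and checks where it resolves — a minimal sanity check, assuming `python setup.py develop` completed without errors:

```python
# Post-install sanity check (assumes `python setup.py develop` succeeded).
import torchao

# For a develop/editable install this path should point into the cloned
# repo rather than into site-packages.
print(torchao.__file__)
```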
Binary file added docs/static/pruning_ecosystem_diagram.png
Binary file added docs/static/pruning_flow.png
27 changes: 14 additions & 13 deletions test/quantization/test_quant_api.py
@@ -18,17 +18,18 @@
get_symmetric_quantization_config,
)

-from torchao.quantization.subclass import (
-to_aqt,
-to_laqt,
+from torchao.dtypes import (
+to_aq,
AffineQuantizedTensor,
-LinearActQuantizedTensor,
)
from torchao.quantization.quant_primitives import (
MappingType,
ZeroPointDomain,
)

+from torchao.quantization.subclass import (
+to_laq,
+LinearActQuantizedTensor,
+)
from torchao.quantization.quant_api import (
_replace_with_custom_fn_if_matches_filter,
apply_dynamic_quant,
@@ -429,17 +430,17 @@ def get_per_token_block_size(x):
# input settings
input_mapping_type = MappingType.ASYMMETRIC
input_target_dtype = torch.int8
-input_quant_func = lambda x: to_aqt(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype)
+input_quant_func = lambda x: to_aq(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype)

m = ToyLinearModel().eval()
m_copy = copy.deepcopy(m)
example_inputs = m.example_inputs()

def apply_weight_quant(weight):
-return to_aqt(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps)
+return to_aq(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps)

def apply_act_quant(weight):
-return to_laqt(weight, input_quant_func)
+return to_laq(weight, input_quant_func)

# note: order is important
m = quantize(m, apply_weight_quant)
@@ -484,7 +485,7 @@ def test_quantized_tensor_subclass_int4(self):
example_inputs = tuple(map(lambda x: x.to(torch.bfloat16).to("cuda"), m.example_inputs()))

def apply_weight_quant(weight):
-return to_aqt(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain)
+return to_aq(weight, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, zero_point_dtype=zero_point_dtype, preserve_zero=preserve_zero, zero_point_domain=zero_point_domain)

m = quantize(m, apply_weight_quant)
assert isinstance(m.linear1.weight, AffineQuantizedTensor)
@@ -515,7 +516,7 @@ def test_quantized_tensor_subclass_int8(self):

def apply_weight_quant(weight):
block_size = (1, weight.shape[1])
-return to_aqt(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)
+return to_aq(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)

m = quantize(m, apply_weight_quant)

@@ -555,7 +556,7 @@ def get_per_token_block_size(x):
input_eps = 1e-5
input_quant_min = -127
input_quant_max = 127
-input_quant_func = lambda x: to_aqt(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)
+input_quant_func = lambda x: to_aq(x, input_mapping_type, get_per_token_block_size(x), input_target_dtype, eps=input_eps, quant_min=input_quant_min, quant_max=input_quant_max, scale_dtype=torch.float32 if x.dtype == torch.float16 else None)

# use 1024 so that we don't need padding
m = ToyLinearModel(1024, 1024, 1024).eval().to(torch.bfloat16).to("cuda")
@@ -565,10 +566,10 @@ def get_per_token_block_size(x):

def apply_weight_quant(weight):
block_size = get_weight_block_size(weight)
-return to_aqt(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)
+return to_aq(weight, mapping_type, block_size, target_dtype, eps=eps, zero_point_dtype=zero_point_dtype)

def apply_act_quant(weight):
-return to_laqt(weight, input_quant_func)
+return to_laq(weight, input_quant_func)

m = quantize(m, apply_weight_quant)
m = quantize(m, apply_act_quant)
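To make the rename concrete, here is a minimal sketch of the new spellings in isolation. It assumes the `to_aq` and `to_laq` signatures shown in this diff and borrows `MappingType.SYMMETRIC` from the surrounding tests; treat it as illustrative rather than the tests' exact configuration:

```python
import torch
from torchao.dtypes import to_aq, AffineQuantizedTensor
from torchao.quantization.quant_primitives import MappingType
from torchao.quantization.subclass import to_laq, LinearActQuantizedTensor

# Per-output-channel int8 weight quantization with the renamed to_aq
# (formerly to_aqt); arguments mirror the int8 path above.
weight = torch.randn(64, 128)
block_size = (1, weight.shape[1])  # one quantization group per output row
qweight = to_aq(weight, MappingType.SYMMETRIC, block_size, torch.int8, eps=1e-5)
assert isinstance(qweight, AffineQuantizedTensor)

# The activation wrapper rename works the same way: to_laqt -> to_laq.
lin_act = to_laq(qweight, lambda x: x)  # identity input-quant func, illustration only
assert isinstance(lin_act, LinearActQuantizedTensor)
```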
15 changes: 0 additions & 15 deletions test/test_ops.py
@@ -30,21 +30,6 @@ def _create_tensors_with_iou(self, N, iou_thresh):
scores = torch.rand(N)
return boxes, scores

-@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
-@unittest.skipIf(not TORCH_VERSION_AFTER_2_4, "skipping when torch verion is 2.3 or lower")
-def test_nms(self):
-iou = 0.2
-boxes, scores = self._create_tensors_with_iou(1000, iou)
-boxes = boxes.cuda()
-scores = scores.cuda()
-
-# smoke test
-_ = torchao.ops.nms(boxes, scores, iou)
-
-# comprehensive testing
-test_utils = ["test_schema", "test_autograd_registration", "test_faketensor", "test_aot_dispatch_dynamic"]
-opcheck(torch.ops.torchao.nms, (boxes, scores, iou), test_utils=test_utils)

def _create_fp6_inputs(self, BS: int, OC: int, IC: int):
# Randomly initialize each bytes. The highest value for randint() is set the the max value of uint32_t.
fp6_weight = torch.randint(4294967295, (OC, IC // 16 * 3)).to(torch.int)
181 changes: 0 additions & 181 deletions torchao/csrc/cuda/nms.cu

This file was deleted.

8 changes: 0 additions & 8 deletions torchao/csrc/nms.cpp

This file was deleted.

3 changes: 3 additions & 0 deletions torchao/dtypes/__init__.py
@@ -1,8 +1,11 @@
from .nf4tensor import NF4Tensor, to_nf4
from .uint4 import UInt4Tensor
+from .aqt import AffineQuantizedTensor, to_aq

__all__ = [
"NF4Tensor",
"to_nf4",
"UInt4Tensor"
"AffineQuantizedTensor",
"to_aq",
]
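With `AffineQuantizedTensor` and `to_aq` now re-exported here, downstream code can import them from `torchao.dtypes` directly — a minimal sketch based only on the `__all__` above:

```python
import torch
from torchao.dtypes import AffineQuantizedTensor, to_aq, to_nf4

# to_nf4 was already part of the public surface; to_aq joins it in this commit.
nf4_weight = to_nf4(torch.randn(64, 64))
print(type(nf4_weight))
```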
