From 000a0fdc8235422bb5018f86e5eae86c713b293a Mon Sep 17 00:00:00 2001
From: Jerry Zhang <jerryzh168@gmail.com>
Date: Fri, 7 Jun 2024 15:19:14 -0700
Subject: [PATCH] Move some util functions from quantization.utils to
 torchao.utils (#337)

Summary:

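Moved the following from torchao/quantization/utils.py to torchao/utils.py:
```
TORCH_VERSION_AFTER_2_2
TORCH_VERSION_AFTER_2_3
TORCH_VERSION_AFTER_2_4
get_model_size_in_bytes
unwrap_tensor_subclass (together with its UnwrapTensorSubclass helper)
```

Callers are updated to import these names, and find_multiple, from
torchao.utils. torchao.quantization.utils no longer re-exports them, and
get_model_size_in_bytes is dropped from torchao.quantization.__all__.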

Test Plan:
python test/integration/test_integration.py

Reviewers:

Subscribers:

Tasks:

Tags:
---
 test/dtypes/test_fp8.py                       |  2 +-
 test/integration/test_integration.py          |  2 +-
 test/prototype/mx_formats/test_custom_cast.py |  2 +-
 test/prototype/mx_formats/test_mx_linear.py   |  3 +-
 test/prototype/mx_formats/test_mx_tensor.py   |  3 +-
 test/prototype/test_bitpacking.py             | 10 +--
 test/quantization/model.py                    |  2 +-
 test/quantization/test_qat.py                 |  2 +-
 test/quantization/test_quant_api.py           |  4 +-
 test/quantization/test_quant_primitives.py    |  2 +-
 test/sparsity/test_fast_sparse_training.py    |  4 +-
 test/sparsity/test_sparse_api.py              |  2 +-
 test/test_ops.py                              |  2 +-
 torchao/_executorch_ops.py                    | 10 +--
 torchao/kernel/intmm.py                       |  2 +-
 torchao/ops.py                                |  2 +-
 torchao/prototype/mx_formats/custom_cast.py   |  2 +-
 torchao/quantization/GPTQ.py                  |  4 +-
 torchao/quantization/__init__.py              |  1 -
 torchao/quantization/autoquant.py             |  2 +-
 torchao/quantization/quant_api.py             |  2 +-
 torchao/quantization/quant_primitives.py      |  2 +-
 torchao/quantization/subclass.py              |  2 +-
 torchao/quantization/utils.py                 | 88 ------------------
 torchao/sparsity/training/__init__.py         |  2 +-
 torchao/sparsity/training/autograd.py         |  6 +-
 torchao/utils.py                              | 89 +++++++++++++++++++
 27 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/test/dtypes/test_fp8.py b/test/dtypes/test_fp8.py
index 811de3a4c3..ae008fc91e 100644
--- a/test/dtypes/test_fp8.py
+++ b/test/dtypes/test_fp8.py
@@ -7,7 +7,7 @@
     parametrize,
     run_tests,
 )
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 try:
     from torchao.prototype.fp8 import gemm_split_k, to_float8
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index c770f455fe..dc30e39b8f 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -70,7 +70,7 @@
 from parameterized import parameterized
 import itertools
 import logging
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3, TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_3, TORCH_VERSION_AFTER_2_4
 
 logger = logging.getLogger("INFO")
 
diff --git a/test/prototype/mx_formats/test_custom_cast.py b/test/prototype/mx_formats/test_custom_cast.py
index 892d5b57f7..d247c70881 100644
--- a/test/prototype/mx_formats/test_custom_cast.py
+++ b/test/prototype/mx_formats/test_custom_cast.py
@@ -44,7 +44,7 @@
 )
 
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 if not TORCH_VERSION_AFTER_2_4:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
diff --git a/test/prototype/mx_formats/test_mx_linear.py b/test/prototype/mx_formats/test_mx_linear.py
index 65f6002dbf..c453b0fe38 100644
--- a/test/prototype/mx_formats/test_mx_linear.py
+++ b/test/prototype/mx_formats/test_mx_linear.py
@@ -19,7 +19,8 @@
     swap_linear_with_mx_linear,
 )
 
-from torchao.quantization.utils import compute_error, TORCH_VERSION_AFTER_2_4
+from torchao.quantization.utils import compute_error
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 # trying to outsmart flake8
 __has_cuda = torch.cuda.is_available()
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
index f1b82e376b..a311f0f050 100644
--- a/test/prototype/mx_formats/test_mx_tensor.py
+++ b/test/prototype/mx_formats/test_mx_tensor.py
@@ -23,7 +23,8 @@
     to_dtype,
 )
 
-from torchao.quantization.utils import compute_error, TORCH_VERSION_AFTER_2_4
+from torchao.quantization.utils import compute_error
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 # trying to outsmart flake8
 __has_cuda = torch.cuda.is_available()
diff --git a/test/prototype/test_bitpacking.py b/test/prototype/test_bitpacking.py
index c1b60e07f8..d1c1d261d1 100644
--- a/test/prototype/test_bitpacking.py
+++ b/test/prototype/test_bitpacking.py
@@ -2,7 +2,7 @@
 from torchao.prototype.common.bitpacking import pack, unpack
 import pytest
 from torch.utils._triton import has_triton
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 if not TORCH_VERSION_AFTER_2_4:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
@@ -20,7 +20,7 @@ def test_uint3_to_int16_col_wise_cpu():
     unpacked = unpack(packed, 3, False, device='cpu')
     unpadded = unpacked[:test_tensor.shape[0], ...]
     assert(unpadded.allclose(test_tensor))
-    
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_uint4_to_uint8():
     test_tensor = torch.randint(0, 15, (4, 4), dtype=torch.uint8).cuda()
@@ -28,7 +28,7 @@ def test_uint4_to_uint8():
     unpacked = unpack(packed, 4)
     unpadded = unpacked[:test_tensor.shape[0], ...]
     assert(unpadded.allclose(test_tensor))
-     
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
 def test_uint4_to_uint8_compile():
@@ -40,7 +40,7 @@ def test_uint4_to_uint8_compile():
     unpacked = unpack_compiled(packed, 4)
     unpadded = unpacked[:test_tensor.shape[0], ...]
     assert(unpadded.allclose(test_tensor))
-    
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_uint3_to_int16():
     test_tensor = torch.randint(0, 7, (5, 8), dtype=torch.int16).cuda()
@@ -67,4 +67,4 @@ def test_uint3_to_int16_col_wise():
     packed = pack(test_tensor,16, 3, False)
     unpacked = unpack(packed, 3, False)
     unpadded = unpacked[:test_tensor.shape[0], ...]
-    assert(unpadded.allclose(test_tensor))
\ No newline at end of file
+    assert(unpadded.allclose(test_tensor))
diff --git a/test/quantization/model.py b/test/quantization/model.py
index e851901c41..94835fc0c3 100644
--- a/test/quantization/model.py
+++ b/test/quantization/model.py
@@ -10,7 +10,7 @@
 import torch.nn as nn
 from torch import Tensor
 from torch.nn import functional as F
-from torchao.quantization.utils import find_multiple
+from torchao.utils import find_multiple
 
 def prepare_inputs_for_model(inps, max_new_tokens=1):
     # this is because input from lm-eval is 2d
diff --git a/test/quantization/test_qat.py b/test/quantization/test_qat.py
index 93323df0f1..f5be66f50a 100644
--- a/test/quantization/test_qat.py
+++ b/test/quantization/test_qat.py
@@ -19,7 +19,7 @@
     fake_quantize_per_token,
 )
 from torchao.quantization.quant_primitives import get_group_qparams_symmetric
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 
 # TODO: put this in a common test utils file
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 68df4f29fe..8a96124f1c 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -44,7 +44,7 @@
     get_apply_int8wo_quant,
     get_apply_int8dyn_quant,
 )
-from torchao.quantization.utils import (
+from torchao.utils import (
     TORCH_VERSION_AFTER_2_3,
     TORCH_VERSION_AFTER_2_4,
 )
@@ -556,7 +556,7 @@ def test_quantized_tensor_subclass_int8_dyn_quant(self):
         self.assertTrue(torch.equal(res, ref))
 
         # workaround for export path
-        from torchao.quantization.utils import unwrap_tensor_subclass
+        from torchao.utils import unwrap_tensor_subclass
         m_unwrapped = unwrap_tensor_subclass(m)
 
         m = torch.export.export(m_unwrapped, example_inputs).module()
diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py
index 6054c6e66f..8cecdf32ea 100644
--- a/test/quantization/test_quant_primitives.py
+++ b/test/quantization/test_quant_primitives.py
@@ -19,7 +19,7 @@
     MappingType,
 )
 
-from torchao.quantization.utils import (
+from torchao.utils import (
     TORCH_VERSION_AFTER_2_3,
     TORCH_VERSION_AFTER_2_4,
 )
diff --git a/test/sparsity/test_fast_sparse_training.py b/test/sparsity/test_fast_sparse_training.py
index b195534664..081f0e4d2f 100644
--- a/test/sparsity/test_fast_sparse_training.py
+++ b/test/sparsity/test_fast_sparse_training.py
@@ -12,7 +12,7 @@
     swap_semi_sparse_linear_with_linear,
     SemiSparseLinear
 )
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 class TestModel(nn.Module):
     def __init__(self):
@@ -42,7 +42,7 @@ def test_runtime_weight_sparsification(self):
             if isinstance(mod, torch.nn.Linear):
                 sparse = SparseSemiStructuredTensorCUSPARSELT.prune_dense_static_sort(mod.weight.detach()).to_dense()
                 mod.weight = nn.Parameter(sparse)
-        
+
         dense_result = model(input)
 
         # map from fqn to replacement linear module
diff --git a/test/sparsity/test_sparse_api.py b/test/sparsity/test_sparse_api.py
index 83c0544f6e..c7bc2700df 100644
--- a/test/sparsity/test_sparse_api.py
+++ b/test/sparsity/test_sparse_api.py
@@ -11,7 +11,7 @@
     _get_subclass_inserter,
     _is_linear,
 )
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 from torch.testing._internal.common_utils import TestCase
 
 
diff --git a/test/test_ops.py b/test/test_ops.py
index b20e029380..cd833359eb 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -2,7 +2,7 @@
 from torch.testing._internal.common_utils import TestCase, IS_FBCODE
 from torch.testing._internal.optests import opcheck
 import torchao
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 import unittest
 from parameterized import parameterized
 import pytest
diff --git a/torchao/_executorch_ops.py b/torchao/_executorch_ops.py
index 33444b55fd..3ec2506ea6 100644
--- a/torchao/_executorch_ops.py
+++ b/torchao/_executorch_ops.py
@@ -9,7 +9,7 @@ def _quantized_decomposed_quantize_per_channel_group_wrapper(*args, **kwargs):
     torch.ops.quantized_decomposed.quantize_per_channel_group is only available
     in PyTorch 2.3+ and recently changed signatures.
     """
-    from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+    from torchao.utils import TORCH_VERSION_AFTER_2_3
     if TORCH_VERSION_AFTER_2_3:
         return torch.ops.quantized_decomposed.quantize_per_channel_group(*args, **kwargs)
     raise ImportError("Need torch.ops.quantized_decomposed.quantize_per_channel_group, which is only available with PyTorch 2.3 or later.")
@@ -23,7 +23,7 @@ def _quantized_decomposed_choose_qparams_per_token_asymmetric_wrapper(*args, **k
     torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric is only available
     in PyTorch 2.3+ and recently changed signatures.
     """
-    from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+    from torchao.utils import TORCH_VERSION_AFTER_2_3
     if TORCH_VERSION_AFTER_2_3:
         return torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(*args, **kwargs)
     raise ImportError("Need torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric, which is only available with PyTorch 2.3 or later.")
@@ -37,7 +37,7 @@ def _quantized_decomposed_dequantize_per_channel_group_wrapper(*args, **kwargs):
     torch.ops.quantized_decomposed.dequantize_per_channel_group is only available
     in PyTorch 2.3+ and recently changed signatures.
     """
-    from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+    from torchao.utils import TORCH_VERSION_AFTER_2_3
     if TORCH_VERSION_AFTER_2_3:
         return torch.ops.quantized_decomposed.dequantize_per_channel_group(*args, **kwargs)
     raise ImportError("Need torch.ops.quantized_decomposed.dequantize_per_channel_group, which is only available with PyTorch 2.3 or later.")
@@ -51,7 +51,7 @@ def _quantized_decomposed_quantize_per_token_wrapper(*args, **kwargs):
     torch.ops.quantized_decomposed.quantize_per_token is only available
     in PyTorch 2.3+ and recently changed signatures.
     """
-    from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+    from torchao.utils import TORCH_VERSION_AFTER_2_3
     if TORCH_VERSION_AFTER_2_3:
         return torch.ops.quantized_decomposed.quantize_per_token(*args, **kwargs)
     raise ImportError("Need torch.ops.quantized_decomposed.quantize_per_token, which is only available with PyTorch 2.3 or later.")
@@ -65,7 +65,7 @@ def _quantized_decomposed_dequantize_per_token_wrapper(*args, **kwargs):
     torch.ops.quantized_decomposed.dequantize_per_token is only available
     in PyTorch 2.3+ and recently changed signatures.
     """
-    from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+    from torchao.utils import TORCH_VERSION_AFTER_2_3
     if TORCH_VERSION_AFTER_2_3:
         return torch.ops.quantized_decomposed.dequantize_per_token(*args, **kwargs)
     raise ImportError("Need torch.ops.quantized_decomposed.dequantize_per_token, which is only available with PyTorch 2.3 or later.")
diff --git a/torchao/kernel/intmm.py b/torchao/kernel/intmm.py
index 8491a2ba6c..28827c543d 100644
--- a/torchao/kernel/intmm.py
+++ b/torchao/kernel/intmm.py
@@ -2,7 +2,7 @@
 import os
 import torch
 
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_2
+from torchao.utils import TORCH_VERSION_AFTER_2_2
 
 try:
     # Only works for torch2.2 or newer.
diff --git a/torchao/ops.py b/torchao/ops.py
index 7fce2de22f..51adb24100 100644
--- a/torchao/ops.py
+++ b/torchao/ops.py
@@ -1,6 +1,6 @@
 import torch
 from torch import Tensor
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 
 def register_custom_op(name):
diff --git a/torchao/prototype/mx_formats/custom_cast.py b/torchao/prototype/mx_formats/custom_cast.py
index 60aaa336ba..91aea9275a 100644
--- a/torchao/prototype/mx_formats/custom_cast.py
+++ b/torchao/prototype/mx_formats/custom_cast.py
@@ -11,7 +11,7 @@
 import torch
 from torch.utils._triton import has_triton
 
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 
 # TODO(future): if needed, make the below work on previous PyTorch versions,
 # just need to hunt down the previous location of `libdevice`. An assert
diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py
index 2e23767a84..6c3f41b834 100644
--- a/torchao/quantization/GPTQ.py
+++ b/torchao/quantization/GPTQ.py
@@ -22,9 +22,11 @@
 from .utils import (
     _lm_eval_available,
     _MultiInput,
-    TORCH_VERSION_AFTER_2_3,
+)
+from torchao.utils import (
     find_multiple,
 )
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 from typing import Any, Dict, Optional
 from .unified import Quantizer
 
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
index ab51dbb3a5..aa265daaf5 100644
--- a/torchao/quantization/__init__.py
+++ b/torchao/quantization/__init__.py
@@ -44,7 +44,6 @@
     "Int8WeightOnlyQuantizedLinearWeight",
     "Int4WeightOnlyQuantizedLinearWeight",
     "compute_error",
-    "get_model_size_in_bytes",
     "WeightOnlyInt8QuantLinear",
     "Int4WeightOnlyGPTQQuantizer",
     "Int4WeightOnlyQuantizer",
diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py
index 9eeb146f55..6ad60e042f 100644
--- a/torchao/quantization/autoquant.py
+++ b/torchao/quantization/autoquant.py
@@ -9,7 +9,7 @@
     quantize_activation_per_token_absmax,
     safe_int_mm,
 )
-from .utils import TORCH_VERSION_AFTER_2_4
+from torchao.utils import TORCH_VERSION_AFTER_2_4
 import torch.nn.functional as F
 try:
     from torch._inductor.utils import do_bench
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 84e8c151c2..510db85512 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -25,7 +25,7 @@
 from typing import Any, Callable
 
 from .dynamic_quant import DynamicallyPerAxisQuantizedLinear
-from .utils import (
+from torchao.utils import (
     TORCH_VERSION_AFTER_2_4,
     unwrap_tensor_subclass,
 )
diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
index d1ad1e7403..5f5ba39d66 100644
--- a/torchao/quantization/quant_primitives.py
+++ b/torchao/quantization/quant_primitives.py
@@ -14,7 +14,7 @@
 
 from torchao.kernel.intmm import int_scaled_matmul
 from torchao.kernel.intmm import safe_int_mm
-from .utils import TORCH_VERSION_AFTER_2_3
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 
 
 __all__ = [
diff --git a/torchao/quantization/subclass.py b/torchao/quantization/subclass.py
index 972699f0bf..75c68cdf82 100644
--- a/torchao/quantization/subclass.py
+++ b/torchao/quantization/subclass.py
@@ -18,7 +18,7 @@
     groupwise_affine_quantize_tensor_from_qparams,
     MappingType,
 )
-from .utils import find_multiple
+from torchao.utils import find_multiple
 from typing import Tuple, Optional, Callable, Dict, Any
 
 
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 1a08f9901a..355be5045e 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -7,20 +7,10 @@
 
 import torch
 from torch.utils._python_dispatch import TorchDispatchMode
-from packaging import version
-import torch.nn.utils.parametrize as parametrize
-from torchao.utils import find_multiple
-
 
 __all__ = [
-    "find_multiple",
     "compute_error",
     "_apply_logging_hook",
-    "get_model_size_in_bytes",
-    "unwrap_tensor_subclass",
-    "TORCH_VERSION_AFTER_2_2",
-    "TORCH_VERSION_AFTER_2_3",
-    "TORCH_VERSION_AFTER_2_4",
 ]
 
 try:
@@ -87,67 +77,6 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
 
         return rs
 
-
-class UnwrapTensorSubclass(torch.nn.Module):
-    def forward(self, *tensors):
-        todo = list(tensors)
-        for tp, meta, inner_tensors in reversed(self.rebuild_stack):
-            nb_tensor = len(inner_tensors)
-            inner_tensors = {a: b for a, b in zip(inner_tensors, todo[-nb_tensor:])}
-            todo = todo[nb_tensor:]
-            rebuilt = tp.__tensor_unflatten__(inner_tensors, meta, None, None)
-            todo.append(rebuilt)
-
-        assert len(todo) == 1
-        return todo[0]
-
-    def right_inverse(self, tensor):
-        assert type(tensor) is not torch.Tensor
-        rebuild_stack = []
-        plain_tensors = []
-        todo = [tensor]
-        while todo:
-            obj = todo.pop()
-            inner_tensors, metadata = obj.__tensor_flatten__()
-            rebuild_stack.append((type(obj), metadata, inner_tensors))
-            for attr_name in inner_tensors:
-                val = getattr(obj, attr_name)
-                if type(val) is torch.Tensor:
-                    plain_tensors.append(val)
-                else:
-                    assert isinstance(val, torch.Tensor)
-                    todo.append(val)
-
-        self.rebuild_stack = rebuild_stack
-
-        return plain_tensors
-
-def unwrap_tensor_subclass(model, filter_fn=None):
-    for name, child in model.named_children():
-        # make sure child.weight is a tensor subclass
-        if (
-            isinstance(child, torch.nn.Linear) and
-            hasattr(child, "weight") and
-            type(child.weight) is not torch.Tensor and
-            type(child.weight) is not torch.nn.Parameter and
-            isinstance(child.weight, torch.Tensor) and
-            issubclass(type(child.weight), torch.Tensor)
-        ):
-            parametrize.register_parametrization(child, "weight", UnwrapTensorSubclass())
-        unwrap_tensor_subclass(child)
-    return model
-
-
-# https://discuss.pytorch.org/t/finding-model-size/130275
-def get_model_size_in_bytes(model):
-    s = 0
-    for p in model.parameters():
-        s += p.nelement() * p.element_size()
-    for b in model.buffers():
-        s += b.nelement() * b.element_size()
-    return s
-
-
 class _MultiInput:
 
     def __init__(self, inputs):
@@ -165,20 +94,3 @@ def cuda(self):
         self.values = [
             val.cuda() if isinstance(val, torch.Tensor) else val for val in self.values
         ]
-
-
-# TODO: quantization namespace is not the right place ot have this
-if version.parse(torch.__version__) >= version.parse("2.4.0.dev"):
-    TORCH_VERSION_AFTER_2_4 = True
-else:
-    TORCH_VERSION_AFTER_2_4 = False
-
-if version.parse(torch.__version__) >= version.parse("2.3.0.dev"):
-    TORCH_VERSION_AFTER_2_3 = True
-else:
-    TORCH_VERSION_AFTER_2_3 = False
-
-if version.parse(torch.__version__) >= version.parse("2.2.0.dev"):
-    TORCH_VERSION_AFTER_2_2 = True
-else:
-    TORCH_VERSION_AFTER_2_2 = False
diff --git a/torchao/sparsity/training/__init__.py b/torchao/sparsity/training/__init__.py
index 16035fe62b..044f6d7515 100644
--- a/torchao/sparsity/training/__init__.py
+++ b/torchao/sparsity/training/__init__.py
@@ -7,7 +7,7 @@
 from torchao.sparsity.training.autograd import semi_structured_sparsify
 from torchao.sparsity.training.pointwise_ops import CUTLASS_POINTWISE_OP_DISPATCH_TABLE
 
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 
 # load pointwise op support, which exists only for CUTLASS
 if TORCH_VERSION_AFTER_2_3:
diff --git a/torchao/sparsity/training/autograd.py b/torchao/sparsity/training/autograd.py
index 8e22cad9fb..e920b72859 100644
--- a/torchao/sparsity/training/autograd.py
+++ b/torchao/sparsity/training/autograd.py
@@ -2,7 +2,7 @@
 import torch
 from torch.sparse import SparseSemiStructuredTensor
 
-from torchao.quantization.utils import TORCH_VERSION_AFTER_2_3
+from torchao.utils import TORCH_VERSION_AFTER_2_3
 
 if TORCH_VERSION_AFTER_2_3:
     from torch.sparse import SparseSemiStructuredTensorCUTLASS, SparseSemiStructuredTensorCUSPARSELT
@@ -120,7 +120,7 @@ def semi_structured_sparsify(
     backend: str = "cutlass",
 ) -> SparseSemiStructuredTensor:
     """
-    Sparsifies a dense tensor into a semi-structured tensor, according to the algo and backend passed. 
+    Sparsifies a dense tensor into a semi-structured tensor, according to the algo and backend passed.
     """
     return _SparsifyFunc.apply(x, algo, backend)
 
@@ -131,6 +131,6 @@ def semi_structured_sparsify_like(
     gradient: GRADIENT_TYPE = GRADIENT_TYPE.SPARSE,
 ) -> SparseSemiStructuredTensor:
     """
-    Sparsifies a dense tensor into a semi-structured tensor, using the mask of the provided pattern. 
+    Sparsifies a dense tensor into a semi-structured tensor, using the mask of the provided pattern.
     """
     return _SparsifyLikeFunc.apply(x, pattern, gradient)
diff --git a/torchao/utils.py b/torchao/utils.py
index 0a3fe5ba97..27650dae1c 100644
--- a/torchao/utils.py
+++ b/torchao/utils.py
@@ -3,6 +3,22 @@
 from typing import Tuple
 from functools import reduce
 from math import gcd
+from packaging import version
+import torch.nn.utils.parametrize as parametrize
+
+__all__ = [
+    "benchmark_model",
+    "profiler_runner",
+    "get_compute_capability",
+    "skip_if_compute_capability_less_than",
+    "benchmark_torch_function_in_microseconds",
+    "find_multiple",
+    "get_model_size_in_bytes",
+    "unwrap_tensor_subclass",
+    "TORCH_VERSION_AFTER_2_2",
+    "TORCH_VERSION_AFTER_2_3",
+    "TORCH_VERSION_AFTER_2_4",
+]
 
 
 def benchmark_model(model, num_runs, input_tensor):
@@ -65,3 +81,76 @@ def find_multiple(n: int, *args: Tuple[int]) -> int:
     if n % k == 0:
         return n
     return n + k - (n % k)
+
+# https://discuss.pytorch.org/t/finding-model-size/130275
+def get_model_size_in_bytes(model):
+    s = 0
+    for p in model.parameters():
+        s += p.nelement() * p.element_size()
+    for b in model.buffers():
+        s += b.nelement() * b.element_size()
+    return s
+
+class UnwrapTensorSubclass(torch.nn.Module):
+    def forward(self, *tensors):
+        todo = list(tensors)
+        for tp, meta, inner_tensors in reversed(self.rebuild_stack):
+            nb_tensor = len(inner_tensors)
+            inner_tensors = {a: b for a, b in zip(inner_tensors, todo[-nb_tensor:])}
+            todo = todo[nb_tensor:]
+            rebuilt = tp.__tensor_unflatten__(inner_tensors, meta, None, None)
+            todo.append(rebuilt)
+
+        assert len(todo) == 1
+        return todo[0]
+
+    def right_inverse(self, tensor):
+        assert type(tensor) is not torch.Tensor
+        rebuild_stack = []
+        plain_tensors = []
+        todo = [tensor]
+        while todo:
+            obj = todo.pop()
+            inner_tensors, metadata = obj.__tensor_flatten__()
+            rebuild_stack.append((type(obj), metadata, inner_tensors))
+            for attr_name in inner_tensors:
+                val = getattr(obj, attr_name)
+                if type(val) is torch.Tensor:
+                    plain_tensors.append(val)
+                else:
+                    assert isinstance(val, torch.Tensor)
+                    todo.append(val)
+
+        self.rebuild_stack = rebuild_stack
+
+        return plain_tensors
+
+def unwrap_tensor_subclass(model, filter_fn=None):
+    for name, child in model.named_children():
+        # make sure child.weight is a tensor subclass
+        if (
+            isinstance(child, torch.nn.Linear) and
+            hasattr(child, "weight") and
+            type(child.weight) is not torch.Tensor and
+            type(child.weight) is not torch.nn.Parameter and
+            isinstance(child.weight, torch.Tensor) and
+            issubclass(type(child.weight), torch.Tensor)
+        ):
+            parametrize.register_parametrization(child, "weight", UnwrapTensorSubclass())
+        unwrap_tensor_subclass(child)
+    return model
+
+if version.parse(torch.__version__) >= version.parse("2.4.0.dev"):
+    TORCH_VERSION_AFTER_2_4 = True
+else:
+    TORCH_VERSION_AFTER_2_4 = False
+
+if version.parse(torch.__version__) >= version.parse("2.3.0.dev"):
+    TORCH_VERSION_AFTER_2_3 = True
+else:
+    TORCH_VERSION_AFTER_2_3 = False
+
+if version.parse(torch.__version__) >= version.parse("2.2.0.dev"):
+    TORCH_VERSION_AFTER_2_2 = True
+else:
+    TORCH_VERSION_AFTER_2_2 = False