From e6d61591845bce9a533f9e02b19bb581093345b0 Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Mon, 9 Oct 2023 15:01:05 +0800
Subject: [PATCH 1/6] add autocast for nms, roi_align on CPU

---
 setup.py                                          |  1 +
 torchvision/csrc/ops/autocast/nms_kernel.cpp      | 14 ++++++++++----
 .../csrc/ops/autocast/roi_align_kernel.cpp        | 15 +++++++++++----
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index ce67413f410..985a3b90aab 100644
--- a/setup.py
+++ b/setup.py
@@ -135,6 +135,7 @@ def get_extensions():
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp"))
+        + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp"))
     )
     source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm"))
diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp
index 96c9ad041de..e3ee94d390a 100644
--- a/torchvision/csrc/ops/autocast/nms_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp
@@ -9,21 +9,27 @@ namespace ops {
 
 namespace {
 
+template<c10::DispatchKey autocast_key, c10::DeviceType device_type>
 at::Tensor nms_autocast(
     const at::Tensor& dets,
     const at::Tensor& scores,
     double iou_threshold) {
-  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key);
+  
   return nms(
-      at::autocast::cached_cast(at::kFloat, dets),
-      at::autocast::cached_cast(at::kFloat, scores),
+      at::autocast::cached_cast(at::kFloat, dets, device_type),
+      at::autocast::cached_cast(at::kFloat, scores, device_type),
       iou_threshold);
 }
 
 } // namespace
 
 TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
-  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_autocast));
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast<c10::DispatchKey::Autocast, c10::DeviceType::CUDA>)));
+}
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
 }
 
 } // namespace ops
diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
index 78cb2309bbe..8748ef73c1d 100644
--- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
@@ -9,6 +9,7 @@ namespace ops {
 
 namespace {
 
+template<c10::DispatchKey autocast_key, c10::DeviceType device_type>
 at::Tensor roi_align_autocast(
     const at::Tensor& input,
     const at::Tensor& rois,
@@ -17,10 +18,10 @@ at::Tensor roi_align_autocast(
     int64_t pooled_width,
     int64_t sampling_ratio,
     bool aligned) {
-  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key);
   return roi_align(
-             at::autocast::cached_cast(at::kFloat, input),
-             at::autocast::cached_cast(at::kFloat, rois),
+             at::autocast::cached_cast(at::kFloat, input, device_type),
+             at::autocast::cached_cast(at::kFloat, rois, device_type),
              spatial_scale,
              pooled_height,
              pooled_width,
@@ -34,7 +35,13 @@ at::Tensor roi_align_autocast(
 TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchvision::roi_align"),
-      TORCH_FN(roi_align_autocast));
+      TORCH_FN((roi_align_autocast<c10::DispatchKey::Autocast, c10::DeviceType::CUDA>)));
+}
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN((roi_align_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
 }
 
 } // namespace ops

From 58bc1d6c1897681c454a507b6dd8422433056279 Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Mon, 16 Oct 2023 16:18:21 +0800
Subject: [PATCH 2/6] add unit tests for CPU autocast of nms and roi_align

---
 test/test_ops.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/test/test_ops.py b/test/test_ops.py
index 743fe159e37..1af7e1c94b3 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -122,6 +122,9 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist
                 tol = 5e-3
             else:
                 tol = 4e-3
+        
+        if x_dtype == torch.bfloat16:
+            tol = 6e-3
 
         pool_size = 5
         # n_channels % (pool_size ** 2) == 0 required for PS operations.
@@ -493,6 +496,21 @@ def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype):
                 rois_dtype=rois_dtype,
             )
 
+    @pytest.mark.parametrize("aligned", (True, False))
+    @pytest.mark.parametrize("deterministic", (True, False))
+    @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16))
+    @pytest.mark.parametrize("rois_dtype", (torch.float, torch.bfloat16))
+    def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype):
+        with torch.cpu.amp.autocast():
+            self.test_forward(
+                torch.device("cpu"),
+                contiguous=False,
+                deterministic=deterministic,
+                aligned=aligned,
+                x_dtype=x_dtype,
+                rois_dtype=rois_dtype,
+            )            
+
     @pytest.mark.parametrize("seed", range(10))
     @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
     @pytest.mark.parametrize("contiguous", (True, False))
@@ -751,6 +769,17 @@ def test_qnms(self, iou, scale, zero_point):
 
         torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou))
 
+    @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
+    def test_nms_cpu(self, iou, dtype=torch.float):
+        err_msg = "NMS incompatible between float and {dtype} for IoU={}"
+
+        boxes, scores = self._create_tensors_with_iou(1000, iou)
+        r_ref = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
+        r_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
+
+        is_eq = torch.allclose(r_ref, r_dtype)
+        assert is_eq, err_msg.format(iou)
+
     @pytest.mark.parametrize(
         "device",
         (
@@ -782,6 +811,12 @@ def test_autocast(self, iou, dtype):
         with torch.cuda.amp.autocast():
             self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda")
 
+    @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
+    @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16))
+    def test_autocast_cpu(self, iou, dtype):
+        with torch.cpu.amp.autocast():
+            self.test_nms_cpu(iou=iou, dtype=dtype)        
+
     @pytest.mark.parametrize(
         "device",
         (

From 134af6132e4a158b71d6371bb29d1ab44cbf27ba Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Tue, 17 Oct 2023 15:02:57 +0800
Subject: [PATCH 3/6] update bfloat16 tolerance in test_forward

---
 test/test_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 1af7e1c94b3..5747038a3cb 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -124,7 +124,7 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist
                 tol = 4e-3
         
         if x_dtype == torch.bfloat16:
-            tol = 6e-3
+            tol = 5e-3
 
         pool_size = 5
         # n_channels % (pool_size ** 2) == 0 required for PS operations.

From b4de21ec9860a26b16b66898d066bfdfbd096636 Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Wed, 18 Oct 2023 16:48:27 +0800
Subject: [PATCH 4/6] fix "multiple rules generate" error on CUDA

---
 setup.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 985a3b90aab..e18283622c4 100644
--- a/setup.py
+++ b/setup.py
@@ -131,11 +131,10 @@ def get_extensions():
 
     main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob(
         os.path.join(extensions_dir, "ops", "*.cpp")
-    )
+    ) + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp"))
-        + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp"))
     )
     source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm"))
@@ -185,8 +184,6 @@ def get_extensions():
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "cuda", "*.cu"))
 
-    source_cuda += glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
-
     sources = main_file + source_cpu
     extension = CppExtension
 

From a792605b5e47852dbb9870773fd9d1edf3442500 Mon Sep 17 00:00:00 2001
From: "Wu, Chunyuan" <chunyuan.wu@intel.com>
Date: Thu, 19 Oct 2023 09:50:48 +0800
Subject: [PATCH 5/6] update unit tests and fix formatting

---
 setup.py                                      |  8 +++---
 test/test_ops.py                              | 25 +++++++------------
 torchvision/csrc/ops/autocast/nms_kernel.cpp  | 14 ++++++++---
 .../csrc/ops/autocast/roi_align_kernel.cpp    | 10 +++++---
 4 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/setup.py b/setup.py
index e18283622c4..7818a598244 100644
--- a/setup.py
+++ b/setup.py
@@ -129,9 +129,11 @@ def get_extensions():
     this_dir = os.path.dirname(os.path.abspath(__file__))
     extensions_dir = os.path.join(this_dir, "torchvision", "csrc")
 
-    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob(
-        os.path.join(extensions_dir, "ops", "*.cpp")
-    ) + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
+    main_file = (
+        glob.glob(os.path.join(extensions_dir, "*.cpp"))
+        + glob.glob(os.path.join(extensions_dir, "ops", "*.cpp"))
+        + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp"))
+    )
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp"))
diff --git a/test/test_ops.py b/test/test_ops.py
index 5747038a3cb..787521722c5 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -122,8 +122,7 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist
                 tol = 5e-3
             else:
                 tol = 4e-3
-        
-        if x_dtype == torch.bfloat16:
+        elif x_dtype == torch.bfloat16:
             tol = 5e-3
 
         pool_size = 5
@@ -509,7 +508,7 @@ def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype):
                 aligned=aligned,
                 x_dtype=x_dtype,
                 rois_dtype=rois_dtype,
-            )            
+            )
 
     @pytest.mark.parametrize("seed", range(10))
     @pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
@@ -730,7 +729,7 @@ def _create_tensors_with_iou(self, N, iou_thresh):
 
     @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
     @pytest.mark.parametrize("seed", range(10))
-    def test_nms_ref(self, iou, seed):
+    def test_nms_ref(self, iou, seed, dtype=torch.float):
         torch.random.manual_seed(seed)
         err_msg = "NMS incompatible between CPU and reference implementation for IoU={}"
         boxes, scores = self._create_tensors_with_iou(1000, iou)
@@ -738,6 +737,11 @@ def test_nms_ref(self, iou, seed):
         keep = ops.nms(boxes, scores, iou)
         torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou))
 
+        if dtype == torch.bfloat16:
+            keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
+            keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
+            torch.testing.assert_close(keep_ref_float, keep_dtype)
+
     def test_nms_input_errors(self):
         with pytest.raises(RuntimeError):
             ops.nms(torch.rand(4), torch.rand(3), 0.5)
@@ -769,17 +773,6 @@ def test_qnms(self, iou, scale, zero_point):
 
         torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou))
 
-    @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
-    def test_nms_cpu(self, iou, dtype=torch.float):
-        err_msg = "NMS incompatible between float and {dtype} for IoU={}"
-
-        boxes, scores = self._create_tensors_with_iou(1000, iou)
-        r_ref = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
-        r_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
-
-        is_eq = torch.allclose(r_ref, r_dtype)
-        assert is_eq, err_msg.format(iou)
-
     @pytest.mark.parametrize(
         "device",
         (
@@ -815,7 +808,7 @@ def test_autocast(self, iou, dtype):
     @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16))
     def test_autocast_cpu(self, iou, dtype):
         with torch.cpu.amp.autocast():
-            self.test_nms_cpu(iou=iou, dtype=dtype)        
+            self.test_nms_ref(iou=iou, seed=0, dtype=dtype)
 
     @pytest.mark.parametrize(
         "device",
diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp
index e3ee94d390a..2acd0f5d0dc 100644
--- a/torchvision/csrc/ops/autocast/nms_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp
@@ -9,13 +9,13 @@ namespace ops {
 
 namespace {
 
-template<c10::DispatchKey autocast_key, c10::DeviceType device_type>
+template <c10::DispatchKey autocast_key, c10::DeviceType device_type>
 at::Tensor nms_autocast(
     const at::Tensor& dets,
     const at::Tensor& scores,
     double iou_threshold) {
   c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key);
-  
+
   return nms(
       at::autocast::cached_cast(at::kFloat, dets, device_type),
       at::autocast::cached_cast(at::kFloat, scores, device_type),
@@ -25,11 +25,17 @@ at::Tensor nms_autocast(
 } // namespace
 
 TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
-  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast<c10::DispatchKey::Autocast, c10::DeviceType::CUDA>)));
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::nms"),
+      TORCH_FN(
+          (nms_autocast<c10::DispatchKey::Autocast, c10::DeviceType::CUDA>)));
 }
 
 TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
-  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::nms"),
+      TORCH_FN(
+          (nms_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
 }
 
 } // namespace ops
diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
index 8748ef73c1d..919393a5ef0 100644
--- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
+++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp
@@ -9,7 +9,7 @@ namespace ops {
 
 namespace {
 
-template<c10::DispatchKey autocast_key, c10::DeviceType device_type>
+template <c10::DispatchKey autocast_key, c10::DeviceType device_type>
 at::Tensor roi_align_autocast(
     const at::Tensor& input,
     const at::Tensor& rois,
@@ -35,13 +35,17 @@ at::Tensor roi_align_autocast(
 TORCH_LIBRARY_IMPL(torchvision, Autocast, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchvision::roi_align"),
-      TORCH_FN((roi_align_autocast<c10::DispatchKey::Autocast, c10::DeviceType::CUDA>)));
+      TORCH_FN((roi_align_autocast<
+                c10::DispatchKey::Autocast,
+                c10::DeviceType::CUDA>)));
 }
 
 TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchvision::roi_align"),
-      TORCH_FN((roi_align_autocast<c10::DispatchKey::AutocastCPU, c10::DeviceType::CPU>)));
+      TORCH_FN((roi_align_autocast<
+                c10::DispatchKey::AutocastCPU,
+                c10::DeviceType::CPU>)));
 }
 
 } // namespace ops

From 8918d2b1ae60db38c9d1ddad662fd01eb6076bf7 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Fri, 20 Oct 2023 18:12:40 +0100
Subject: [PATCH 6/6] Put back test_autocast_cpu as standalone

---
 test/test_ops.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/test/test_ops.py b/test/test_ops.py
index 787521722c5..3a613c9b767 100644
--- a/test/test_ops.py
+++ b/test/test_ops.py
@@ -729,7 +729,7 @@ def _create_tensors_with_iou(self, N, iou_thresh):
 
     @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
     @pytest.mark.parametrize("seed", range(10))
-    def test_nms_ref(self, iou, seed, dtype=torch.float):
+    def test_nms_ref(self, iou, seed):
         torch.random.manual_seed(seed)
         err_msg = "NMS incompatible between CPU and reference implementation for IoU={}"
         boxes, scores = self._create_tensors_with_iou(1000, iou)
@@ -737,11 +737,6 @@ def test_nms_ref(self, iou, seed, dtype=torch.float):
         keep = ops.nms(boxes, scores, iou)
         torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou))
 
-        if dtype == torch.bfloat16:
-            keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
-            keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
-            torch.testing.assert_close(keep_ref_float, keep_dtype)
-
     def test_nms_input_errors(self):
         with pytest.raises(RuntimeError):
             ops.nms(torch.rand(4), torch.rand(3), 0.5)
@@ -807,8 +802,11 @@ def test_autocast(self, iou, dtype):
     @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8))
     @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16))
     def test_autocast_cpu(self, iou, dtype):
+        boxes, scores = self._create_tensors_with_iou(1000, iou)
         with torch.cpu.amp.autocast():
-            self.test_nms_ref(iou=iou, seed=0, dtype=dtype)
+            keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou)
+            keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou)
+        torch.testing.assert_close(keep_ref_float, keep_dtype)
 
     @pytest.mark.parametrize(
         "device",