From e6d61591845bce9a533f9e02b19bb581093345b0 Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Mon, 9 Oct 2023 15:01:05 +0800 Subject: [PATCH 1/6] add autocast for nms, roi_align on CPU --- setup.py | 1 + torchvision/csrc/ops/autocast/nms_kernel.cpp | 14 ++++++++++---- .../csrc/ops/autocast/roi_align_kernel.cpp | 15 +++++++++++---- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index ce67413f410..985a3b90aab 100644 --- a/setup.py +++ b/setup.py @@ -135,6 +135,7 @@ def get_extensions(): source_cpu = ( glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) + + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) ) source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm")) diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp index 96c9ad041de..e3ee94d390a 100644 --- a/torchvision/csrc/ops/autocast/nms_kernel.cpp +++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp @@ -9,21 +9,27 @@ namespace ops { namespace { +template at::Tensor nms_autocast( const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { - c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key); + return nms( - at::autocast::cached_cast(at::kFloat, dets), - at::autocast::cached_cast(at::kFloat, scores), + at::autocast::cached_cast(at::kFloat, dets, device_type), + at::autocast::cached_cast(at::kFloat, scores, device_type), iou_threshold); } } // namespace TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { - m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_autocast)); + m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); +} + +TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { + m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); } } // namespace ops diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp index 78cb2309bbe..8748ef73c1d 100644 --- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp @@ -9,6 +9,7 @@ namespace ops { namespace { +template at::Tensor roi_align_autocast( const at::Tensor& input, const at::Tensor& rois, @@ -17,10 +18,10 @@ at::Tensor roi_align_autocast( int64_t pooled_width, int64_t sampling_ratio, bool aligned) { - c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key); return roi_align( - at::autocast::cached_cast(at::kFloat, input), - at::autocast::cached_cast(at::kFloat, rois), + at::autocast::cached_cast(at::kFloat, input, device_type), + at::autocast::cached_cast(at::kFloat, rois, device_type), spatial_scale, pooled_height, pooled_width, @@ -34,7 +35,13 @@ at::Tensor roi_align_autocast( TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { m.impl( TORCH_SELECTIVE_NAME("torchvision::roi_align"), - TORCH_FN(roi_align_autocast)); + TORCH_FN((roi_align_autocast))); +} + +TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { + m.impl( + TORCH_SELECTIVE_NAME("torchvision::roi_align"), + TORCH_FN((roi_align_autocast))); } } // namespace ops From 58bc1d6c1897681c454a507b6dd8422433056279 Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Mon, 16 Oct 2023 16:18:21 +0800 Subject: [PATCH 2/6] add UT --- test/test_ops.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/test_ops.py b/test/test_ops.py index 743fe159e37..1af7e1c94b3 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -122,6 +122,9 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist tol = 5e-3 else: tol = 4e-3 + + if x_dtype == torch.bfloat16: + tol = 6e-3 pool_size = 5 # n_channels % (pool_size ** 2) == 0 required for PS operations. @@ -493,6 +496,21 @@ def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype): rois_dtype=rois_dtype, ) + @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16)) + @pytest.mark.parametrize("rois_dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): + with torch.cpu.amp.autocast(): + self.test_forward( + torch.device("cpu"), + contiguous=False, + deterministic=deterministic, + aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + ) + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) @@ -751,6 +769,17 @@ def test_qnms(self, iou, scale, zero_point): torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) + @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) + def test_nms_cpu(self, iou, dtype=torch.float): + err_msg = "NMS incompatible between float and {dtype} for IoU={}" + + boxes, scores = self._create_tensors_with_iou(1000, iou) + r_ref = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + r_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + + is_eq = torch.allclose(r_ref, r_dtype) + assert is_eq, err_msg.format(iou) + @pytest.mark.parametrize( "device", ( @@ -782,6 +811,12 @@ def test_autocast(self, iou, dtype): with torch.cuda.amp.autocast(): self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda") + @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) + @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, iou, dtype): + with torch.cpu.amp.autocast(): + self.test_nms_cpu(iou=iou, dtype=dtype) + @pytest.mark.parametrize( "device", ( From 134af6132e4a158b71d6371bb29d1ab44cbf27ba Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Tue, 17 Oct 2023 15:02:57 +0800 Subject: [PATCH 3/6] update tol --- test/test_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ops.py b/test/test_ops.py index 1af7e1c94b3..5747038a3cb 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -124,7 +124,7 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist tol = 4e-3 if x_dtype == torch.bfloat16: - tol = 6e-3 + tol = 5e-3 pool_size = 5 # n_channels % (pool_size ** 2) == 0 required for PS operations. From b4de21ec9860a26b16b66898d066bfdfbd096636 Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Wed, 18 Oct 2023 16:48:27 +0800 Subject: [PATCH 4/6] fix "multiple rules generate" error on CUDA --- setup.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 985a3b90aab..e18283622c4 100644 --- a/setup.py +++ b/setup.py @@ -131,11 +131,10 @@ def get_extensions(): main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob( os.path.join(extensions_dir, "ops", "*.cpp") - ) + ) + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) source_cpu = ( glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) - + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) ) source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm")) @@ -185,8 +184,6 @@ def get_extensions(): else: source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "cuda", "*.cu")) - source_cuda += glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) - sources = main_file + source_cpu extension = CppExtension From a792605b5e47852dbb9870773fd9d1edf3442500 Mon Sep 17 00:00:00 2001 From: "Wu, Chunyuan" Date: Thu, 19 Oct 2023 09:50:48 +0800 Subject: [PATCH 5/6] update UT and fix format --- setup.py | 8 +++--- test/test_ops.py | 25 +++++++------------ torchvision/csrc/ops/autocast/nms_kernel.cpp | 14 ++++++++--- .../csrc/ops/autocast/roi_align_kernel.cpp | 10 +++++--- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index e18283622c4..7818a598244 100644 --- a/setup.py +++ b/setup.py @@ -129,9 +129,11 @@ def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "torchvision", "csrc") - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob( - os.path.join(extensions_dir, "ops", "*.cpp") - ) + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + main_file = ( + glob.glob(os.path.join(extensions_dir, "*.cpp")) + + glob.glob(os.path.join(extensions_dir, "ops", "*.cpp")) + + glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) + ) source_cpu = ( glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) diff --git a/test/test_ops.py b/test/test_ops.py index 5747038a3cb..787521722c5 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -122,8 +122,7 @@ def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, determinist tol = 5e-3 else: tol = 4e-3 - - if x_dtype == torch.bfloat16: + elif x_dtype == torch.bfloat16: tol = 5e-3 pool_size = 5 @@ -509,7 +508,7 @@ def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype, - ) + ) @pytest.mark.parametrize("seed", range(10)) @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @@ -730,7 +729,7 @@ def _create_tensors_with_iou(self, N, iou_thresh): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("seed", range(10)) - def test_nms_ref(self, iou, seed): + def test_nms_ref(self, iou, seed, dtype=torch.float): torch.random.manual_seed(seed) err_msg = "NMS incompatible between CPU and reference implementation for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) @@ -738,6 +737,11 @@ def test_nms_ref(self, iou, seed): keep = ops.nms(boxes, scores, iou) torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) + if dtype == torch.bfloat16: + keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + torch.testing.assert_close(keep_ref_float, keep_dtype) + def test_nms_input_errors(self): with pytest.raises(RuntimeError): ops.nms(torch.rand(4), torch.rand(3), 0.5) @@ -769,17 +773,6 @@ def test_qnms(self, iou, scale, zero_point): torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cpu(self, iou, dtype=torch.float): - err_msg = "NMS incompatible between float and {dtype} for IoU={}" - - boxes, scores = self._create_tensors_with_iou(1000, iou) - r_ref = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) - r_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) - - is_eq = torch.allclose(r_ref, r_dtype) - assert is_eq, err_msg.format(iou) - @pytest.mark.parametrize( "device", ( @@ -815,7 +808,7 @@ def test_autocast(self, iou, dtype): @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) def test_autocast_cpu(self, iou, dtype): with torch.cpu.amp.autocast(): - self.test_nms_cpu(iou=iou, dtype=dtype) + self.test_nms_ref(iou=iou, seed=0, dtype=dtype) @pytest.mark.parametrize( "device", diff --git a/torchvision/csrc/ops/autocast/nms_kernel.cpp b/torchvision/csrc/ops/autocast/nms_kernel.cpp index e3ee94d390a..2acd0f5d0dc 100644 --- a/torchvision/csrc/ops/autocast/nms_kernel.cpp +++ b/torchvision/csrc/ops/autocast/nms_kernel.cpp @@ -9,13 +9,13 @@ namespace ops { namespace { -template +template at::Tensor nms_autocast( const at::Tensor& dets, const at::Tensor& scores, double iou_threshold) { c10::impl::ExcludeDispatchKeyGuard no_autocast(autocast_key); - + return nms( at::autocast::cached_cast(at::kFloat, dets, device_type), at::autocast::cached_cast(at::kFloat, scores, device_type), @@ -25,11 +25,17 @@ at::Tensor nms_autocast( } // namespace TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { - m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); + m.impl( + TORCH_SELECTIVE_NAME("torchvision::nms"), + TORCH_FN( + (nms_autocast))); } TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { - m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN((nms_autocast))); + m.impl( + TORCH_SELECTIVE_NAME("torchvision::nms"), + TORCH_FN( + (nms_autocast))); } } // namespace ops diff --git a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp index 8748ef73c1d..919393a5ef0 100644 --- a/torchvision/csrc/ops/autocast/roi_align_kernel.cpp +++ b/torchvision/csrc/ops/autocast/roi_align_kernel.cpp @@ -9,7 +9,7 @@ namespace ops { namespace { -template +template at::Tensor roi_align_autocast( const at::Tensor& input, const at::Tensor& rois, @@ -35,13 +35,17 @@ at::Tensor roi_align_autocast( TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { m.impl( TORCH_SELECTIVE_NAME("torchvision::roi_align"), - TORCH_FN((roi_align_autocast))); + TORCH_FN((roi_align_autocast< + c10::DispatchKey::Autocast, + c10::DeviceType::CUDA>))); } TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) { m.impl( TORCH_SELECTIVE_NAME("torchvision::roi_align"), - TORCH_FN((roi_align_autocast))); + TORCH_FN((roi_align_autocast< + c10::DispatchKey::AutocastCPU, + c10::DeviceType::CPU>))); } } // namespace ops From 8918d2b1ae60db38c9d1ddad662fd01eb6076bf7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 20 Oct 2023 18:12:40 +0100 Subject: [PATCH 6/6] Put back test_autocast_cpu as standalone --- test/test_ops.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/test_ops.py b/test/test_ops.py index 787521722c5..3a613c9b767 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -729,7 +729,7 @@ def _create_tensors_with_iou(self, N, iou_thresh): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("seed", range(10)) - def test_nms_ref(self, iou, seed, dtype=torch.float): + def test_nms_ref(self, iou, seed): torch.random.manual_seed(seed) err_msg = "NMS incompatible between CPU and reference implementation for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) @@ -737,11 +737,6 @@ def test_nms_ref(self, iou, seed, dtype=torch.float): keep = ops.nms(boxes, scores, iou) torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) - if dtype == torch.bfloat16: - keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) - keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) - torch.testing.assert_close(keep_ref_float, keep_dtype) - def test_nms_input_errors(self): with pytest.raises(RuntimeError): ops.nms(torch.rand(4), torch.rand(3), 0.5) @@ -807,8 +802,11 @@ def test_autocast(self, iou, dtype): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) def test_autocast_cpu(self, iou, dtype): + boxes, scores = self._create_tensors_with_iou(1000, iou) with torch.cpu.amp.autocast(): - self.test_nms_ref(iou=iou, seed=0, dtype=dtype) + keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + torch.testing.assert_close(keep_ref_float, keep_dtype) @pytest.mark.parametrize( "device",