Add atomic_ref support for 8 and 16b types. (#2255)

* Support fetch_add and CAS on 8/16b
* Add 16b test
* Fix issues found when enabling 8/16b in a heterogeneous test, PTX seems to be invalid though
* Remove 16b cas and use only 32b cas.
* Get several tests passing for 8/16b atomics
* Remove todo and ifdefs from tests covering 8b/16b atomics
* Fix bug in 16b atomic load
* Move store close to fetch_update since it is a derived primitive
* Fix bug in minmax due to s64 overload missing for arithmetic types
* Add more 8/16b tests for atomic_ref
* Fixup remove debug prints
* Cleanup bitmask hell, fix bug where lower mask was ignored
* Add test covering interleaved CAS onto same atomic window
* Fixup documentation mistake.
* Make atomics enable_if uses match rest of libcudacxx.
* Verify fetch_add sequential load behavior in 8b/16b atomics
* Remove 8b/16b add PTX tests
* Optimize fetch_update CAS loops
* Fix name of preset for PTX codegen test
* Fix signed/unsigned comparison
* Fix atomics codegen tests not being built
* Fix CMake target for libcudacxx ptx tests.
* Make dump_and_check executable again
* Work around inconsistent parsing of [[[ in FileCheck versions
* Make min/max match algorithm min/max.
* Work around NVCC 11.X using different syntax for inline ptx
* Fix warnings in the codegen tests.
* Use PTX 16b ld/st instead of 32b CAS
* Switch 8b ld/st to 16b ld
* Make atomic min/max match std::min/max behavior
* Atomic codegen tests: clang-cuda reorders some arguments, include them in the DAG to fix CI failures
NVIDIA · Oct 16, 2024 · 8ecb3c1 · 8ecb3c1
1 parent 3b61b07
commit 8ecb3c1
Show file tree

Hide file tree

Showing 22 changed files with 540 additions and 559 deletions.
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -350,17 +350,23 @@
     {
       "name": "libcudacxx-codegen",
       "configurePreset": "libcudacxx-codegen",
-      "targets": ["libcudacxx.atomics.codegen"]
+      "targets": [
+        "libcudacxx.atomics.codegen"
+      ]
     },
     {
       "name": "libcudacxx-codegen-install",
       "configurePreset": "libcudacxx-codegen",
-      "targets": ["libcudacxx.atomics.codegen.install"]
+      "targets": [
+        "libcudacxx.atomics.codegen.install"
+      ]
     },
     {
       "name": "libcudacxx-nvrtcc",
       "hidden": true,
-      "targets": ["libcudacxx.nvrtcc"]
+      "targets": [
+        "libcudacxx.nvrtcc"
+      ]
     },
     {
       "name": "libcudacxx-base",
@@ -369,52 +375,69 @@
         "libcudacxx.test.internal_headers",
         "libcudacxx.test.public_headers",
         "libcudacxx.test.public_headers_host_only",
-        "libcudacxx.test.lit.precompile"
+        "libcudacxx.test.lit.precompile",
+        "libcudacxx.test.atomics.ptx"
       ]
     },
     {
       "name": "libcudacxx-nvrtc-cpp11",
       "hidden": false,
-      "inherits": ["libcudacxx-nvrtcc"],
+      "inherits": [
+        "libcudacxx-nvrtcc"
+      ],
       "configurePreset": "libcudacxx-nvrtc-cpp11"
     },
     {
       "name": "libcudacxx-nvrtc-cpp14",
       "hidden": false,
-      "inherits": ["libcudacxx-nvrtcc"],
+      "inherits": [
+        "libcudacxx-nvrtcc"
+      ],
       "configurePreset": "libcudacxx-nvrtc-cpp14"
     },
     {
       "name": "libcudacxx-nvrtc-cpp17",
       "hidden": false,
-      "inherits": ["libcudacxx-nvrtcc"],
+      "inherits": [
+        "libcudacxx-nvrtcc"
+      ],
       "configurePreset": "libcudacxx-nvrtc-cpp17"
     },
     {
       "name": "libcudacxx-nvrtc-cpp20",
       "hidden": false,
-      "inherits": ["libcudacxx-nvrtcc"],
+      "inherits": [
+        "libcudacxx-nvrtcc"
+      ],
       "configurePreset": "libcudacxx-nvrtc-cpp20"
     },
     {
       "name": "libcudacxx-cpp11",
       "configurePreset": "libcudacxx-cpp11",
-      "inherits": ["libcudacxx-base"]
+      "inherits": [
+        "libcudacxx-base"
+      ]
     },
     {
       "name": "libcudacxx-cpp14",
       "configurePreset": "libcudacxx-cpp14",
-      "inherits": ["libcudacxx-base"]
+      "inherits": [
+        "libcudacxx-base"
+      ]
     },
     {
       "name": "libcudacxx-cpp17",
       "configurePreset": "libcudacxx-cpp17",
-      "inherits": ["libcudacxx-base"]
+      "inherits": [
+        "libcudacxx-base"
+      ]
     },
     {
       "name": "libcudacxx-cpp20",
       "configurePreset": "libcudacxx-cpp20",
-      "inherits": ["libcudacxx-base"]
+      "inherits": [
+        "libcudacxx-base"
+      ]
     },
     {
       "name": "cub-cpp11",
@@ -487,6 +510,18 @@
       "configurePreset": "all-dev-debug",
       "inherits": "all-dev"
     },
+    {
+      "name": "libcudacxx-ctest-base",
+      "hidden": true,
+      "inherits": [
+        "base"
+      ],
+      "filter": {
+        "exclude": {
+          "name": "^libcudacxx\\.test\\.lit$"
+        }
+      }
+    },
     {
       "name": "libcudacxx-codegen",
       "configurePreset": "libcudacxx-codegen",

diff --git a/libcudacxx/codegen/generators/compare_and_swap.h b/libcudacxx/codegen/generators/compare_and_swap.h
@@ -83,7 +83,6 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
   };
 
   constexpr size_t supported_sizes[] = {
-    16,
     32,
     64,
     128,

diff --git a/libcudacxx/codegen/generators/exchange.h b/libcudacxx/codegen/generators/exchange.h
@@ -81,7 +81,6 @@ static inline _CCCL_DEVICE void __cuda_atomic_exchange(
   };
 
   constexpr size_t supported_sizes[] = {
-    16,
     32,
     64,
     128,