Don't use le32/le64 #8344

Merged Jul 26, 2024, 41 commits. The diff below shows the changes as of commit 9 of 41.

Context: the le32/le64 pseudo-triples are no longer available as of LLVM 19 (note the LLVM_PACKAGE_VERSION gates in the CMake changes below), so the generic runtime modules switch to concrete triples instead.

Commits (41):

ddd0189  Use wasm32/wasm64 instead of le32/le64 (steven-johnson, Jul 15, 2024)
60b46ee  Update Makefile (steven-johnson, Jul 15, 2024)
3cce91f  Update CMakeLists.txt (steven-johnson, Jul 15, 2024)
24fd1b2  Update CMakeLists.txt (steven-johnson, Jul 15, 2024)
67d80ae  trigger buildbots (steven-johnson, Jul 15, 2024)
301c610  Try x86 instead of wasm (steven-johnson, Jul 16, 2024)
09fba5f  -Wno-sync-alignment (steven-johnson, Jul 16, 2024)
af4af91  Revert "-Wno-sync-alignment" (abadams, Jul 16, 2024)
8002729  Don't do 64-bit atomics in 32-bit runtimes. (abadams, Jul 16, 2024)
30a755c  Reapply "-Wno-sync-alignment" (steven-johnson, Jul 17, 2024)
2cfd588  Revert "Don't do 64-bit atomics in 32-bit runtimes." (steven-johnson, Jul 17, 2024)
9325a2b  try arm instead (steven-johnson, Jul 17, 2024)
a9b0b61  sync warning (steven-johnson, Jul 17, 2024)
4e5bd58  gnueabihf (steven-johnson, Jul 17, 2024)
bfbe718  gnueabihf (steven-johnson, Jul 17, 2024)
2e8a6e8  Comments (steven-johnson, Jul 17, 2024)
6401755  Update CMakeLists.txt (steven-johnson, Jul 17, 2024)
def7da0  Let's try x86 instead (steven-johnson, Jul 17, 2024)
0e766ae  -Wno-sync-alignment (steven-johnson, Jul 17, 2024)
3e99fb8  Back to wasm (steven-johnson, Jul 18, 2024)
678164f  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 18, 2024)
0c050b2  Fix RUNTIME_TRIPLE_WIN_GENERIC_64 (steven-johnson, Jul 19, 2024)
3ca975b  Reenable warning (steven-johnson, Jul 19, 2024)
622c9cd  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
9885b30  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
6b6a76e  Merge branch 'main' into srj/no-le64 (steven-johnson, Jul 23, 2024)
5b0ccc0  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
c88ab38  Merge branch 'main' into srj/no-le64 (steven-johnson, Jul 24, 2024)
fe54bfe  Fixes (steven-johnson, Jul 24, 2024)
b28acbc  warnings (steven-johnson, Jul 24, 2024)
97046ce  wasm64->x86_64 (steven-johnson, Jul 24, 2024)
007186b  back to wasm64 (steven-johnson, Jul 24, 2024)
38f9ab6  Use wasm32/wasm64 for webgpu (steven-johnson, Jul 25, 2024)
2d5033b  Update Makefile (steven-johnson, Jul 25, 2024)
39cd51d  trigger buildbots (steven-johnson, Jul 25, 2024)
55c3610  Revert "Update Makefile" (steven-johnson, Jul 25, 2024)
a2064e4  Revert "Use wasm32/wasm64 for webgpu" (steven-johnson, Jul 25, 2024)
09c86a2  Revert "back to wasm64" (steven-johnson, Jul 25, 2024)
abbc1cd  -fno-jump-tables (steven-johnson, Jul 25, 2024)
b9497bd  Reapply "Use wasm32/wasm64 for webgpu" (steven-johnson, Jul 26, 2024)
d6affab  tabs (steven-johnson, Jul 26, 2024)
9 changes: 5 additions & 4 deletions Makefile

@@ -1053,9 +1053,9 @@ $(BIN_DIR)/build_halide_h: $(ROOT_DIR)/tools/build_halide_h.cpp
.SECONDARY:

# Compile generic 32- or 64-bit code
-# (The 'nacl' is a red herring. This is just a generic 32-bit little-endian target.)
-RUNTIME_TRIPLE_32 = "le32-unknown-nacl-unknown"
-RUNTIME_TRIPLE_64 = "le64-unknown-unknown-unknown"
+# (The 'i386' and 'x86_64' are red herrings. These are just generic 32- and 64-bit little-endian targets.)
+RUNTIME_TRIPLE_32 = "i386-unknown-unknown-unknown"
+RUNTIME_TRIPLE_64 = "x86_64-unknown-unknown-unknown"

# Windows requires special handling. The generic windows_* modules must have -fpic elided
# and (for 64 bit) must set wchar to be 2 bytes. The windows_*_x86 and windows_*_arm
@@ -1068,7 +1068,8 @@
RUNTIME_TRIPLE_WIN_X86_32 = "i386-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_X86_64 = "x86_64-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_ARM_32 = "arm-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_ARM_64 = "aarch64-unknown-windows-unknown"
-RUNTIME_TRIPLE_WIN_GENERIC_64 = "le64-unknown-windows-unknown"
+# TODO: was le64 here, not sure if this is correct or not
+RUNTIME_TRIPLE_WIN_GENERIC_64 = "x86_64-unknown-windows-unknown"

# `-fno-threadsafe-statics` is very important here (note that it allows us to use a 'modern' C++
# standard but still skip threadsafe guards for static initialization in our runtime code)
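The 2-byte-wchar requirement called out above is worth pinning down in code. A hypothetical compile-time guard (not part of this PR) that a generic Windows runtime module could carry to catch a mis-set triple or a missing -fshort-wchar:

```cpp
// Hypothetical guard, not in the PR: the generic windows_* runtime modules
// must match the MSVC ABI, where wchar_t is 2 bytes; clang only guarantees
// this here when -fshort-wchar is passed (see the comment above).
static_assert(sizeof(wchar_t) == 2,
              "generic Windows runtime modules require a 2-byte wchar_t");
```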
27 changes: 21 additions & 6 deletions src/runtime/CMakeLists.txt

@@ -217,17 +217,32 @@ foreach (i IN LISTS RUNTIME_CPP)
# unfortunately, clang doesn't automatically set this flag even though the
# ABI is msvc on windows
set(fshort-wchar -fshort-wchar)
set(TARGET "le64-unknown-windows-unknown")
if (LLVM_PACKAGE_VERSION VERSION_LESS 19.0)
set(TARGET "le64-unknown-windows-unknown")
else ()
# TODO: was le64 here, not sure if this is correct or not
set(TARGET "x86_64-unknown-windows-unknown")
endif ()
endif ()
endif()
# Everything else
else()
-    if (j EQUAL 32)
-        # (The 'nacl' is a red herring. This is just a generic 32-bit little-endian target.)
-        set(TARGET "le32-unknown-nacl-unknown")
+    if (LLVM_PACKAGE_VERSION VERSION_LESS 19.0)
+        if (j EQUAL 32)
+            # generic 32-bit code
+            set(TARGET "le32-unknown-nacl-unknown")
+        else ()
+            # generic 64-bit code
+            set(TARGET "le64-unknown-unknown-unknown")
+        endif ()
    else ()
-        # generic 64-bit code
-        set(TARGET "le64-unknown-unknown-unknown")
+        if (j EQUAL 32)
+            # generic 32-bit code
+            set(TARGET "i386-unknown-unknown-unknown")
+        else ()
+            # generic 64-bit code
+            set(TARGET "x86_64-unknown-unknown-unknown")
+        endif ()
    endif ()
endif ()
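For context, the 32/64 split above exists because each runtime module is compiled once per pointer width. A small illustrative check (assuming a BITS_64 define passed by the runtime build for the 64-bit flavor; not code from this PR) of the invariant the generic triples are chosen to satisfy:

```cpp
// Illustrative only: a runtime module built with the generic 32-bit triple
// must see 4-byte pointers, and the generic 64-bit triple 8-byte pointers.
// BITS_64 is assumed to be the define the runtime build passes when j == 64.
#ifdef BITS_64
static_assert(sizeof(void *) == 8, "64-bit runtime module needs an 8-byte pointer target");
#else
static_assert(sizeof(void *) == 4, "32-bit runtime module needs a 4-byte pointer target");
#endif
```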

27 changes: 15 additions & 12 deletions src/runtime/profiler_common.cpp

@@ -180,11 +180,10 @@ WEAK void sampling_profiler_thread(void *) {

namespace {

-template<typename T>
-void sync_compare_max_and_swap(T *ptr, T val) {
+void sync_compare_max_and_swap(uintptr_t *ptr, uintptr_t val) {
using namespace Halide::Runtime::Internal::Synchronization;

-T old_val = *ptr;
+uintptr_t old_val = *ptr;
while (val > old_val) {
if (atomic_cas_strong_sequentially_consistent(ptr, &old_val, &val)) {
return;
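The loop above is the standard atomic "store max" pattern: read the current value, then CAS until either the stored value is already at least val or our value wins. A self-contained sketch of the same pattern against std::atomic (an illustrative stand-in, not the runtime's actual atomics API):

```cpp
#include <atomic>
#include <cstdint>

// Atomically raise slot to at least val, mirroring sync_compare_max_and_swap.
inline void store_max(std::atomic<uintptr_t> &slot, uintptr_t val) {
    uintptr_t old_val = slot.load(std::memory_order_seq_cst);
    while (val > old_val) {
        // On failure, compare_exchange_strong reloads old_val, so the loop
        // re-tests against the freshest value; on success the max is published.
        if (slot.compare_exchange_strong(old_val, val,
                                         std::memory_order_seq_cst)) {
            return;
        }
    }
}
```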
@@ -350,7 +349,11 @@ WEAK void halide_profiler_stack_peak_update(void *user_context,
// Update per-func memory stats
for (int i = 0; i < instance->pipeline_stats->num_funcs; ++i) {
if (f_values[i] != 0) {
-sync_compare_max_and_swap(&(instance->funcs[i]).stack_peak, f_values[i]);
+// On 32-bit platforms we don't want to use 64-bit
+// atomics. Fortunately on these platforms memory usage fits into
+// 32-bit integers.
+sync_compare_max_and_swap((uintptr_t *)(&(instance->funcs[i]).stack_peak),
+                          (uintptr_t)(f_values[i]));
}
}
}

Review thread on the sync_compare_max_and_swap change:

[Contributor, PR author]: This isn't safe or correct. stack_peak is a uint64, so the result will be incorrect on 32-bit platforms.

[Member]: This assumes little-endian. It will just update the low 32 bits of stack_peak. The high bits will be zero on 32-bit targets anyway.

[Contributor, PR author]: Ouch.. pretty hacky, needs better documentation. I think the warnings here are a bit of a red herring; I'd like to deal with them but as a separate PR.

[Contributor, PR author]: Wouldn't it be cleaner to just change stack_peak to be a uintptr_t?

[Member]: I was worried about breaking existing users of the profiler stats.
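A quick illustration of the little-endian point made in the thread (illustrative host code, not runtime code): a 32-bit write through the address of a uint64_t modifies only the low half, which is exactly why the cast is tolerable when the high bits are known to be zero:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    uint64_t stack_peak = 0;          // high 32 bits stay zero on 32-bit targets
    uint32_t new_peak = 0x12345678u;  // the value a 32-bit atomic op would store

    // On a little-endian host the low half of stack_peak lives at its lowest
    // address; memcpy stands in for the runtime's pointer cast to keep the
    // aliasing well-defined in portable C++.
    std::memcpy(&stack_peak, &new_peak, sizeof(new_peak));

    assert(stack_peak == 0x12345678u);  // full 64-bit value is still correct
    return 0;
}
```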
@@ -382,15 +385,15 @@ WEAK void halide_profiler_memory_allocate(void *user_context,

// Update per-instance memory stats
atomic_add_fetch_sequentially_consistent(&instance->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&instance->memory_total, incr);
-uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&instance->memory_current, incr);
-sync_compare_max_and_swap(&instance->memory_peak, p_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_total), (uintptr_t)incr);
+uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&instance->memory_peak), (uintptr_t)p_mem_current);

// Update per-func memory stats
atomic_add_fetch_sequentially_consistent(&func->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&func->memory_total, incr);
-uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&func->memory_current, incr);
-sync_compare_max_and_swap(&func->memory_peak, f_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_total), (uintptr_t)incr);
+uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&func->memory_peak), (uintptr_t)f_mem_current);
}
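The allocate path above is a compact recipe: atomically bump the running total and current usage, then race the post-add value against the recorded peak. The same scheme in miniature, using the std::atomic stand-ins from the earlier sketch (names illustrative, not the runtime's API):

```cpp
#include <atomic>
#include <cstdint>

std::atomic<uintptr_t> memory_current{0};
std::atomic<uintptr_t> memory_peak{0};

// Sketch of the peak-tracking pattern used on the allocate path.
void note_allocation(uintptr_t size) {
    // fetch_add returns the old value, so add size back to get the
    // post-increment total, matching atomic_add_fetch semantics.
    uintptr_t now = memory_current.fetch_add(size) + size;
    uintptr_t peak = memory_peak.load();
    while (now > peak) {
        if (memory_peak.compare_exchange_strong(peak, now)) {
            break;  // we published a new peak
        }
        // otherwise peak was reloaded; stop once someone else's peak >= now
    }
}
```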

WEAK void halide_profiler_memory_free(void *user_context,
@@ -418,10 +421,10 @@
// unless user specifically calls halide_profiler_reset().

// Update per-pipeline memory stats
-atomic_sub_fetch_sequentially_consistent(&instance->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)decr);

// Update per-func memory stats
-atomic_sub_fetch_sequentially_consistent(&func->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)decr);
}

WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
13 changes: 13 additions & 0 deletions src/runtime/runtime_atomics.h

@@ -35,36 +35,43 @@ ALWAYS_INLINE uintptr_t atomic_and_fetch_release(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE T atomic_fetch_add_acquire_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_add_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_sub_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_sub(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_or_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_or(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_add_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_add_and_fetch(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_sub_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_sub_and_fetch(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE bool cas_strong_sequentially_consistent_helper(T *addr, TV *expected, TV *desired) {
+static_assert(sizeof(T) == 4);
TV oldval = *expected;
TV gotval = __sync_val_compare_and_swap(addr, oldval, *desired);
*expected = gotval;
@@ -99,11 +106,13 @@ ALWAYS_INLINE bool atomic_cas_weak_acquire_relaxed(uintptr_t *addr, uintptr_t *e

template<typename T>
ALWAYS_INLINE T atomic_fetch_and_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_and_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

@@ -121,6 +130,7 @@ ALWAYS_INLINE void atomic_load_acquire(T *addr, T *val) {
template<typename T>
ALWAYS_INLINE T atomic_exchange_acquire(T *addr, T val) {
// Despite the name, this is really just an exchange operation with acquire ordering.
+static_assert(sizeof(T) == 4);
return __sync_lock_test_and_set(addr, val);
}

@@ -130,17 +140,20 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
}

template<typename T>
ALWAYS_INLINE void atomic_store_release(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE void atomic_store_sequentially_consistent(T *addr, TV *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}
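The new static_asserts pin every __sync-based fallback to 4-byte operands: on 32-bit targets, 64-bit __sync builtins can lower to library calls or trip alignment warnings (see the -Wno-sync-alignment back-and-forth in the commit history above), so it is safer to reject them at compile time. A minimal sketch of how the guard behaves (illustrative; it mirrors the pattern above rather than quoting the header):

```cpp
#include <cstdint>

template<typename T>
inline T atomic_add_fetch_seq_cst(T *addr, T val) {
    // Rejects any operand that isn't exactly 4 bytes, at compile time.
    static_assert(sizeof(T) == 4, "generic __sync path is 32-bit only");
    return __sync_add_and_fetch(addr, val);
}

void example() {
    uint32_t ok = 0;
    atomic_add_fetch_seq_cst(&ok, 1u);  // fine: 4-byte operand

    // uint64_t bad = 0;
    // atomic_add_fetch_seq_cst(&bad, uint64_t{1});  // would fail the static_assert
}
```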