Don't use le32/le64 #8344

Merged Jul 26, 2024, 41 commits. The diff below shows the changes as of commit 9 of 41.

Context: the le32/le64 pseudo-triples are no longer available as of LLVM 19 (note the LLVM_PACKAGE_VERSION gates in the CMake changes below), so the generic runtime modules switch to concrete triples instead.

Commits (41):

ddd0189  Use wasm32/wasm64 instead of le32/le64 (steven-johnson, Jul 15, 2024)
60b46ee  Update Makefile (steven-johnson, Jul 15, 2024)
3cce91f  Update CMakeLists.txt (steven-johnson, Jul 15, 2024)
24fd1b2  Update CMakeLists.txt (steven-johnson, Jul 15, 2024)
67d80ae  trigger buildbots (steven-johnson, Jul 15, 2024)
301c610  Try x86 instead of wasm (steven-johnson, Jul 16, 2024)
09fba5f  -Wno-sync-alignment (steven-johnson, Jul 16, 2024)
af4af91  Revert "-Wno-sync-alignment" (abadams, Jul 16, 2024)
8002729  Don't do 64-bit atomics in 32-bit runtimes. (abadams, Jul 16, 2024)
30a755c  Reapply "-Wno-sync-alignment" (steven-johnson, Jul 17, 2024)
2cfd588  Revert "Don't do 64-bit atomics in 32-bit runtimes." (steven-johnson, Jul 17, 2024)
9325a2b  try arm instead (steven-johnson, Jul 17, 2024)
a9b0b61  sync warning (steven-johnson, Jul 17, 2024)
4e5bd58  gnueabihf (steven-johnson, Jul 17, 2024)
bfbe718  gnueabihf (steven-johnson, Jul 17, 2024)
2e8a6e8  Comments (steven-johnson, Jul 17, 2024)
6401755  Update CMakeLists.txt (steven-johnson, Jul 17, 2024)
def7da0  Let's try x86 instead (steven-johnson, Jul 17, 2024)
0e766ae  -Wno-sync-alignment (steven-johnson, Jul 17, 2024)
3e99fb8  Back to wasm (steven-johnson, Jul 18, 2024)
678164f  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 18, 2024)
0c050b2  Fix RUNTIME_TRIPLE_WIN_GENERIC_64 (steven-johnson, Jul 19, 2024)
3ca975b  Reenable warning (steven-johnson, Jul 19, 2024)
622c9cd  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
9885b30  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
6b6a76e  Merge branch 'main' into srj/no-le64 (steven-johnson, Jul 23, 2024)
5b0ccc0  Update LLVM_Runtime_Linker.cpp (steven-johnson, Jul 23, 2024)
c88ab38  Merge branch 'main' into srj/no-le64 (steven-johnson, Jul 24, 2024)
fe54bfe  Fixes (steven-johnson, Jul 24, 2024)
b28acbc  warnings (steven-johnson, Jul 24, 2024)
97046ce  wasm64->x86_64 (steven-johnson, Jul 24, 2024)
007186b  back to wasm64 (steven-johnson, Jul 24, 2024)
38f9ab6  Use wasm32/wasm64 for webgpu (steven-johnson, Jul 25, 2024)
2d5033b  Update Makefile (steven-johnson, Jul 25, 2024)
39cd51d  trigger buildbots (steven-johnson, Jul 25, 2024)
55c3610  Revert "Update Makefile" (steven-johnson, Jul 25, 2024)
a2064e4  Revert "Use wasm32/wasm64 for webgpu" (steven-johnson, Jul 25, 2024)
09c86a2  Revert "back to wasm64" (steven-johnson, Jul 25, 2024)
abbc1cd  -fno-jump-tables (steven-johnson, Jul 25, 2024)
b9497bd  Reapply "Use wasm32/wasm64 for webgpu" (steven-johnson, Jul 26, 2024)
d6affab  tabs (steven-johnson, Jul 26, 2024)
9 changes: 5 additions & 4 deletions Makefile

@@ -1053,9 +1053,9 @@ $(BIN_DIR)/build_halide_h: $(ROOT_DIR)/tools/build_halide_h.cpp
.SECONDARY:

# Compile generic 32- or 64-bit code
-# (The 'nacl' is a red herring. This is just a generic 32-bit little-endian target.)
-RUNTIME_TRIPLE_32 = "le32-unknown-nacl-unknown"
-RUNTIME_TRIPLE_64 = "le64-unknown-unknown-unknown"
+# (The 'i386' and 'x86_64' are red herrings. These are just generic 32- and 64-bit little-endian targets.)
+RUNTIME_TRIPLE_32 = "i386-unknown-unknown-unknown"
+RUNTIME_TRIPLE_64 = "x86_64-unknown-unknown-unknown"

# Windows requires special handling. The generic windows_* modules must have -fpic elided
# and (for 64 bit) must set wchar to be 2 bytes. The windows_*_x86 and windows_*_arm
@@ -1068,7 +1068,8 @@
RUNTIME_TRIPLE_WIN_X86_32 = "i386-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_X86_64 = "x86_64-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_ARM_32 = "arm-unknown-windows-unknown"
RUNTIME_TRIPLE_WIN_ARM_64 = "aarch64-unknown-windows-unknown"
-RUNTIME_TRIPLE_WIN_GENERIC_64 = "le64-unknown-windows-unknown"
+# TODO: was le64 here, not sure if this is correct or not
+RUNTIME_TRIPLE_WIN_GENERIC_64 = "x86_64-unknown-windows-unknown"

# `-fno-threadsafe-statics` is very important here (note that it allows us to use a 'modern' C++
# standard but still skip threadsafe guards for static initialization in our runtime code)
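The 2-byte-wchar requirement called out above is worth pinning down in code. A hypothetical compile-time guard (not part of this PR) that a generic Windows runtime module could carry to catch a mis-set triple or a missing -fshort-wchar:

```cpp
// Hypothetical guard, not in the PR: the generic windows_* runtime modules
// must match the MSVC ABI, where wchar_t is 2 bytes; clang only guarantees
// this here when -fshort-wchar is passed (see the comment above).
static_assert(sizeof(wchar_t) == 2,
              "generic Windows runtime modules require a 2-byte wchar_t");
```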
27 changes: 21 additions & 6 deletions src/runtime/CMakeLists.txt

@@ -217,17 +217,32 @@ foreach (i IN LISTS RUNTIME_CPP)
# unfortunately, clang doesn't automatically set this flag even though the
# ABI is msvc on windows
set(fshort-wchar -fshort-wchar)
set(TARGET "le64-unknown-windows-unknown")
if (LLVM_PACKAGE_VERSION VERSION_LESS 19.0)
set(TARGET "le64-unknown-windows-unknown")
else ()
# TODO: was le64 here, not sure if this is correct or not
set(TARGET "x86_64-unknown-windows-unknown")
endif ()
endif ()
endif()
# Everything else
else()
-    if (j EQUAL 32)
-        # (The 'nacl' is a red herring. This is just a generic 32-bit little-endian target.)
-        set(TARGET "le32-unknown-nacl-unknown")
+    if (LLVM_PACKAGE_VERSION VERSION_LESS 19.0)
+        if (j EQUAL 32)
+            # generic 32-bit code
+            set(TARGET "le32-unknown-nacl-unknown")
+        else ()
+            # generic 64-bit code
+            set(TARGET "le64-unknown-unknown-unknown")
+        endif ()
    else ()
-        # generic 64-bit code
-        set(TARGET "le64-unknown-unknown-unknown")
+        if (j EQUAL 32)
+            # generic 32-bit code
+            set(TARGET "i386-unknown-unknown-unknown")
+        else ()
+            # generic 64-bit code
+            set(TARGET "x86_64-unknown-unknown-unknown")
+        endif ()
    endif ()
endif ()
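For context, the 32/64 split above exists because each runtime module is compiled once per pointer width. A small illustrative check (assuming a BITS_64 define passed by the runtime build for the 64-bit flavor; not code from this PR) of the invariant the generic triples are chosen to satisfy:

```cpp
// Illustrative only: a runtime module built with the generic 32-bit triple
// must see 4-byte pointers, and the generic 64-bit triple 8-byte pointers.
// BITS_64 is assumed to be the define the runtime build passes when j == 64.
#ifdef BITS_64
static_assert(sizeof(void *) == 8, "64-bit runtime module needs an 8-byte pointer target");
#else
static_assert(sizeof(void *) == 4, "32-bit runtime module needs a 4-byte pointer target");
#endif
```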

27 changes: 15 additions & 12 deletions src/runtime/profiler_common.cpp

@@ -180,11 +180,10 @@ WEAK void sampling_profiler_thread(void *) {

namespace {

-template<typename T>
-void sync_compare_max_and_swap(T *ptr, T val) {
+void sync_compare_max_and_swap(uintptr_t *ptr, uintptr_t val) {
using namespace Halide::Runtime::Internal::Synchronization;

-T old_val = *ptr;
+uintptr_t old_val = *ptr;
while (val > old_val) {
if (atomic_cas_strong_sequentially_consistent(ptr, &old_val, &val)) {
return;
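The loop above is the standard atomic "store max" pattern: read the current value, then CAS until either the stored value is already at least val or our value wins. A self-contained sketch of the same pattern against std::atomic (an illustrative stand-in, not the runtime's actual atomics API):

```cpp
#include <atomic>
#include <cstdint>

// Atomically raise slot to at least val, mirroring sync_compare_max_and_swap.
inline void store_max(std::atomic<uintptr_t> &slot, uintptr_t val) {
    uintptr_t old_val = slot.load(std::memory_order_seq_cst);
    while (val > old_val) {
        // On failure, compare_exchange_strong reloads old_val, so the loop
        // re-tests against the freshest value; on success the max is published.
        if (slot.compare_exchange_strong(old_val, val,
                                         std::memory_order_seq_cst)) {
            return;
        }
    }
}
```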
@@ -350,7 +349,11 @@ WEAK void halide_profiler_stack_peak_update(void *user_context,
// Update per-func memory stats
for (int i = 0; i < instance->pipeline_stats->num_funcs; ++i) {
if (f_values[i] != 0) {
-sync_compare_max_and_swap(&(instance->funcs[i]).stack_peak, f_values[i]);
+// On 32-bit platforms we don't want to use 64-bit
+// atomics. Fortunately on these platforms memory usage fits into
+// 32-bit integers.
+sync_compare_max_and_swap((uintptr_t *)(&(instance->funcs[i]).stack_peak),
+                          (uintptr_t)(f_values[i]));
}
}
}

Review thread on the sync_compare_max_and_swap change:

[Contributor, PR author]: This isn't safe or correct. stack_peak is a uint64, so the result will be incorrect on 32-bit platforms.

[Member]: This assumes little-endian. It will just update the low 32 bits of stack_peak. The high bits will be zero on 32-bit targets anyway.

[Contributor, PR author]: Ouch.. pretty hacky, needs better documentation. I think the warnings here are a bit of a red herring; I'd like to deal with them but as a separate PR.

[Contributor, PR author]: Wouldn't it be cleaner to just change stack_peak to be a uintptr_t?

[Member]: I was worried about breaking existing users of the profiler stats.
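A quick illustration of the little-endian point made in the thread (illustrative host code, not runtime code): a 32-bit write through the address of a uint64_t modifies only the low half, which is exactly why the cast is tolerable when the high bits are known to be zero:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    uint64_t stack_peak = 0;          // high 32 bits stay zero on 32-bit targets
    uint32_t new_peak = 0x12345678u;  // the value a 32-bit atomic op would store

    // On a little-endian host the low half of stack_peak lives at its lowest
    // address; memcpy stands in for the runtime's pointer cast to keep the
    // aliasing well-defined in portable C++.
    std::memcpy(&stack_peak, &new_peak, sizeof(new_peak));

    assert(stack_peak == 0x12345678u);  // full 64-bit value is still correct
    return 0;
}
```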
@@ -382,15 +385,15 @@ WEAK void halide_profiler_memory_allocate(void *user_context,

// Update per-instance memory stats
atomic_add_fetch_sequentially_consistent(&instance->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&instance->memory_total, incr);
-uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&instance->memory_current, incr);
-sync_compare_max_and_swap(&instance->memory_peak, p_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_total), (uintptr_t)incr);
+uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&instance->memory_peak), (uintptr_t)p_mem_current);

// Update per-func memory stats
atomic_add_fetch_sequentially_consistent(&func->num_allocs, 1);
-atomic_add_fetch_sequentially_consistent(&func->memory_total, incr);
-uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&func->memory_current, incr);
-sync_compare_max_and_swap(&func->memory_peak, f_mem_current);
+atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_total), (uintptr_t)incr);
+uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)incr);
+sync_compare_max_and_swap((uintptr_t *)(&func->memory_peak), (uintptr_t)f_mem_current);
}
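The allocate path above is a compact recipe: atomically bump the running total and current usage, then race the post-add value against the recorded peak. The same scheme in miniature, using the std::atomic stand-ins from the earlier sketch (names illustrative, not the runtime's API):

```cpp
#include <atomic>
#include <cstdint>

std::atomic<uintptr_t> memory_current{0};
std::atomic<uintptr_t> memory_peak{0};

// Sketch of the peak-tracking pattern used on the allocate path.
void note_allocation(uintptr_t size) {
    // fetch_add returns the old value, so add size back to get the
    // post-increment total, matching atomic_add_fetch semantics.
    uintptr_t now = memory_current.fetch_add(size) + size;
    uintptr_t peak = memory_peak.load();
    while (now > peak) {
        if (memory_peak.compare_exchange_strong(peak, now)) {
            break;  // we published a new peak
        }
        // otherwise peak was reloaded; stop once someone else's peak >= now
    }
}
```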

WEAK void halide_profiler_memory_free(void *user_context,
@@ -418,10 +421,10 @@
// unless user specifically calls halide_profiler_reset().

// Update per-pipeline memory stats
-atomic_sub_fetch_sequentially_consistent(&instance->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&instance->memory_current), (uintptr_t)decr);

// Update per-func memory stats
-atomic_sub_fetch_sequentially_consistent(&func->memory_current, decr);
+atomic_sub_fetch_sequentially_consistent((uintptr_t *)(&func->memory_current), (uintptr_t)decr);
}

WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
13 changes: 13 additions & 0 deletions src/runtime/runtime_atomics.h

@@ -35,36 +35,43 @@ ALWAYS_INLINE uintptr_t atomic_and_fetch_release(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE T atomic_fetch_add_acquire_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_add_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_add(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_sub_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_sub(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_or_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_or(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_add_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_add_and_fetch(addr, val);
}

template<typename T>
ALWAYS_INLINE T atomic_sub_fetch_sequentially_consistent(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_sub_and_fetch(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE bool cas_strong_sequentially_consistent_helper(T *addr, TV *expected, TV *desired) {
+static_assert(sizeof(T) == 4);
TV oldval = *expected;
TV gotval = __sync_val_compare_and_swap(addr, oldval, *desired);
*expected = gotval;
@@ -99,11 +106,13 @@ ALWAYS_INLINE bool atomic_cas_weak_acquire_relaxed(uintptr_t *addr, uintptr_t *e

template<typename T>
ALWAYS_INLINE T atomic_fetch_and_release(T *addr, T val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE T atomic_fetch_and_sequentially_consistent(T *addr, TV val) {
+static_assert(sizeof(T) == 4);
return __sync_fetch_and_and(addr, val);
}

@@ -121,6 +130,7 @@ ALWAYS_INLINE void atomic_load_acquire(T *addr, T *val) {
template<typename T>
ALWAYS_INLINE T atomic_exchange_acquire(T *addr, T val) {
// Despite the name, this is really just an exchange operation with acquire ordering.
+static_assert(sizeof(T) == 4);
return __sync_lock_test_and_set(addr, val);
}

@@ -130,17 +140,20 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)

template<typename T>
ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
}

template<typename T>
ALWAYS_INLINE void atomic_store_release(T *addr, T *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}

template<typename T, typename TV = typename remove_volatile<T>::type>
ALWAYS_INLINE void atomic_store_sequentially_consistent(T *addr, TV *val) {
+static_assert(sizeof(T) == 4);
*addr = *val;
__sync_synchronize();
}
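The new static_asserts pin every __sync-based fallback to 4-byte operands: on 32-bit targets, 64-bit __sync builtins can lower to library calls or trip alignment warnings (see the -Wno-sync-alignment back-and-forth in the commit history above), so it is safer to reject them at compile time. A minimal sketch of how the guard behaves (illustrative; it mirrors the pattern above rather than quoting the header):

```cpp
#include <cstdint>

template<typename T>
inline T atomic_add_fetch_seq_cst(T *addr, T val) {
    // Rejects any operand that isn't exactly 4 bytes, at compile time.
    static_assert(sizeof(T) == 4, "generic __sync path is 32-bit only");
    return __sync_add_and_fetch(addr, val);
}

void example() {
    uint32_t ok = 0;
    atomic_add_fetch_seq_cst(&ok, 1u);  // fine: 4-byte operand

    // uint64_t bad = 0;
    // atomic_add_fetch_seq_cst(&bad, uint64_t{1});  // would fail the static_assert
}
```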