diff --git a/core/os/spin_lock.h b/core/os/spin_lock.h
index acd07f54904d..5b28d9d525af 100644
--- a/core/os/spin_lock.h
+++ b/core/os/spin_lock.h
@@ -31,27 +31,6 @@
 #ifndef SPIN_LOCK_H
 #define SPIN_LOCK_H
 
-#include "core/typedefs.h"
-
-#if defined(__APPLE__)
-
-#include <os/lock.h>
-
-class SpinLock {
-	mutable os_unfair_lock _lock = OS_UNFAIR_LOCK_INIT;
-
-public:
-	_ALWAYS_INLINE_ void lock() const {
-		os_unfair_lock_lock(&_lock);
-	}
-
-	_ALWAYS_INLINE_ void unlock() const {
-		os_unfair_lock_unlock(&_lock);
-	}
-};
-
-#else
-
 #include "core/os/thread.h"
 
 #include <atomic>
@@ -59,14 +38,14 @@
 
 static_assert(std::atomic_bool::is_always_lock_free);
 
-class alignas(Thread::CACHE_LINE_BYTES) SpinLock {
+class alignas(Thread::CACHE_LINE_BYTES) AcqRelSpinLock {
 	mutable std::atomic<bool> locked = ATOMIC_VAR_INIT(false);
 
 public:
 	_ALWAYS_INLINE_ void lock() const {
 		while (true) {
 			bool expected = false;
-			if (locked.compare_exchange_weak(expected, true, std::memory_order_acq_rel, std::memory_order_relaxed)) {
+			if (locked.compare_exchange_weak(expected, true, std::memory_order_acquire, std::memory_order_relaxed)) {
 				break;
 			}
 			do {
@@ -78,8 +57,46 @@ class alignas(Thread::CACHE_LINE_BYTES) SpinLock {
 	_ALWAYS_INLINE_ void unlock() const {
 		locked.store(false, std::memory_order_release);
 	}
+
+	_ALWAYS_INLINE_ void acquire() const {
+		(void)locked.load(std::memory_order_acquire);
+	}
+
+	_ALWAYS_INLINE_ void release() const {
+		// Do as little as possible to issue a release on the atomic
+		// without changing its value.
+		while (true) {
+			for (int i = 0; i < 2; i++) {
+				bool expected = (bool)i;
+				if (locked.compare_exchange_weak(expected, expected, std::memory_order_release, std::memory_order_relaxed)) {
+					return;
+				}
+			}
+		}
+	}
 };
 
+#if defined(__APPLE__)
+
+#include <os/lock.h>
+
+class SpinLock {
+	mutable os_unfair_lock _lock = OS_UNFAIR_LOCK_INIT;
+
+public:
+	_ALWAYS_INLINE_ void lock() const {
+		os_unfair_lock_lock(&_lock);
+	}
+
+	_ALWAYS_INLINE_ void unlock() const {
+		os_unfair_lock_unlock(&_lock);
+	}
+};
+
+#else
+
+using SpinLock = AcqRelSpinLock;
+
 #endif // __APPLE__
 
 #endif // SPIN_LOCK_H
diff --git a/core/templates/rid_owner.h b/core/templates/rid_owner.h
index 42001590546c..8cd135b07226 100644
--- a/core/templates/rid_owner.h
+++ b/core/templates/rid_owner.h
@@ -32,7 +32,7 @@
 #define RID_OWNER_H
 
 #include "core/os/memory.h"
-#include "core/os/mutex.h"
+#include "core/os/spin_lock.h"
 #include "core/string/print_string.h"
 #include "core/templates/hash_set.h"
 #include "core/templates/list.h"
@@ -43,6 +43,20 @@
 #include <stdio.h>
 #include <typeinfo>
 
+#ifdef SANITIZERS_ENABLED
+#ifdef __has_feature
+#if __has_feature(thread_sanitizer)
+#define TSAN_ENABLED
+#endif
+#elif defined(__SANITIZE_THREAD__)
+#define TSAN_ENABLED
+#endif
+#endif
+
+#ifdef TSAN_ENABLED
+#include <sanitizer/tsan_interface.h>
+#endif
+
 class RID_AllocBase {
 	static SafeNumeric<uint64_t> base_id;
 
@@ -83,18 +97,18 @@ class RID_Alloc : public RID_AllocBase {
 
 	const char *description = nullptr;
 
-	mutable Mutex mutex;
+	AcqRelSpinLock spin;
 
 	_FORCE_INLINE_ RID _allocate_rid() {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		if (alloc_count == max_alloc) {
 			//allocate a new chunk
 			uint32_t chunk_count = alloc_count == 0 ? 0 : (max_alloc / elements_in_chunk);
 			if (THREAD_SAFE && chunk_count == chunk_limit) {
-				mutex.unlock();
+				spin.unlock();
 				if (description != nullptr) {
 					ERR_FAIL_V_MSG(RID(), vformat("Element limit for RID of type '%s' reached.", String(description)));
 				} else {
@@ -120,7 +134,8 @@ class RID_Alloc : public RID_AllocBase {
 				free_list_chunks[chunk_count][i] = alloc_count + i;
 			}
 
-			max_alloc += elements_in_chunk;
+			// Store atomically to avoid data race with the load in get_or_null().
+			((std::atomic<uint32_t> *)&max_alloc)->store(max_alloc + elements_in_chunk, std::memory_order_relaxed);
 		}
 
 		uint32_t free_index = free_list_chunks[alloc_count / elements_in_chunk][alloc_count % elements_in_chunk];
@@ -140,7 +155,7 @@ class RID_Alloc : public RID_AllocBase {
 		alloc_count++;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 
 		return _make_from_id(id);
@@ -168,9 +183,13 @@ class RID_Alloc : public RID_AllocBase {
 			return nullptr;
 		}
 
+		spin.acquire();
+
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
-		if (unlikely(idx >= max_alloc)) {
+		// Read atomically to avoid data race with the store in _allocate_rid().
+		uint32_t ma = ((std::atomic<uint32_t> *)&max_alloc)->load(std::memory_order_acquire);
+		if (unlikely(idx >= ma)) {
 			return nullptr;
 		}
 
@@ -180,6 +199,9 @@ class RID_Alloc : public RID_AllocBase {
 		uint32_t validator = uint32_t(id >> 32);
 
 		Chunk &c = chunks[idx_chunk][idx_element];
+#ifdef TSAN_ENABLED
+		__tsan_acquire(&c.validator); // We know not a race in practice.
+#endif
 		if (unlikely(p_initialize)) {
 			if (unlikely(!(c.validator & 0x80000000))) {
 				ERR_FAIL_V_MSG(nullptr, "Initializing already initialized RID");
@@ -189,7 +211,8 @@ class RID_Alloc : public RID_AllocBase {
 				ERR_FAIL_V_MSG(nullptr, "Attempting to initialize the wrong RID");
 			}
 
-			c.validator &= 0x7FFFFFFF; //initialized
+			// Mark initialized.
+			c.validator &= 0x7FFFFFFF;
 
 		} else if (unlikely(c.validator != validator)) {
 			if ((c.validator & 0x80000000) && c.validator != 0xFFFFFFFF) {
@@ -197,6 +220,9 @@ class RID_Alloc : public RID_AllocBase {
 			}
 			return nullptr;
 		}
+#ifdef TSAN_ENABLED
+		__tsan_release(&c.validator);
+#endif
 
 		T *ptr = &c.data;
 
@@ -205,24 +231,39 @@ class RID_Alloc : public RID_AllocBase {
 	void initialize_rid(RID p_rid) {
 		T *mem = get_or_null(p_rid, true);
 		ERR_FAIL_NULL(mem);
+#ifdef TSAN_ENABLED
+		__tsan_acquire(mem); // We know not a race in practice.
+#endif
 		memnew_placement(mem, T);
+#ifdef TSAN_ENABLED
+		__tsan_release(mem);
+#endif
+		spin.release();
 	}
+
 	void initialize_rid(RID p_rid, const T &p_value) {
 		T *mem = get_or_null(p_rid, true);
 		ERR_FAIL_NULL(mem);
+#ifdef TSAN_ENABLED
+		__tsan_acquire(mem); // We know not a race in practice.
+#endif
 		memnew_placement(mem, T(p_value));
+#ifdef TSAN_ENABLED
+		__tsan_release(mem);
+#endif
+		spin.release();
 	}
 
 	_FORCE_INLINE_ bool owns(const RID &p_rid) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
 		if (unlikely(idx >= max_alloc)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			return false;
 		}
@@ -235,7 +276,7 @@ class RID_Alloc : public RID_AllocBase {
 		bool owned = (validator != 0x7FFFFFFF) && (chunks[idx_chunk][idx_element].validator & 0x7FFFFFFF) == validator;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 
 		return owned;
@@ -243,14 +284,14 @@ class RID_Alloc : public RID_AllocBase {
 
 	_FORCE_INLINE_ void free(const RID &p_rid) {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
 		if (unlikely(idx >= max_alloc)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL();
 		}
@@ -261,12 +302,12 @@ class RID_Alloc : public RID_AllocBase {
 		uint32_t validator = uint32_t(id >> 32);
 		if (unlikely(chunks[idx_chunk][idx_element].validator & 0x80000000)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL_MSG("Attempted to free an uninitialized or invalid RID");
 		} else if (unlikely(chunks[idx_chunk][idx_element].validator != validator)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL();
 		}
@@ -278,7 +319,7 @@ class RID_Alloc : public RID_AllocBase {
 		free_list_chunks[alloc_count / elements_in_chunk][alloc_count % elements_in_chunk] = idx;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
@@ -287,35 +328,35 @@ class RID_Alloc : public RID_AllocBase {
 	}
 	void get_owned_list(List<RID> *p_owned) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 		for (size_t i = 0; i < max_alloc; i++) {
-			uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+			uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 			if (validator != 0xFFFFFFFF) {
-				p_owned->push_back(_make_from_id((validator << 32) | i));
+				p_owned->push_back(_make_from_id(((uint64_t)validator << 32) | i));
 			}
 		}
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
 	//used for fast iteration in the elements or RIDs
 	void fill_owned_buffer(RID *p_rid_buffer) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 		uint32_t idx = 0;
 		for (size_t i = 0; i < max_alloc; i++) {
-			uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+			uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 			if (validator != 0xFFFFFFFF) {
-				p_rid_buffer[idx] = _make_from_id((validator << 32) | i);
+				p_rid_buffer[idx] = _make_from_id(((uint64_t)validator << 32) | i);
 				idx++;
 			}
 		}
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
@@ -329,16 +370,21 @@ class RID_Alloc : public RID_AllocBase {
 			chunk_limit = (p_maximum_number_of_elements / elements_in_chunk) + 1;
 			chunks = (Chunk **)memalloc(sizeof(Chunk *) * chunk_limit);
 			free_list_chunks = (uint32_t **)memalloc(sizeof(uint32_t *) * chunk_limit);
+			spin.release();
 		}
 	}
 
 	~RID_Alloc() {
+		if constexpr (THREAD_SAFE) {
+			spin.lock();
+		}
+
 		if (alloc_count) {
 			print_error(vformat("ERROR: %d RID allocations of type '%s' were leaked at exit.",
					alloc_count, description ? description : typeid(T).name()));
 
 			for (size_t i = 0; i < max_alloc; i++) {
-				uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+				uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 				if (validator & 0x80000000) {
 					continue; //uninitialized
 				}
diff --git a/tests/core/templates/test_rid.h b/tests/core/templates/test_rid.h
index ba9a2bb5e267..c35dfb8c7bab 100644
--- a/tests/core/templates/test_rid.h
+++ b/tests/core/templates/test_rid.h
@@ -31,7 +31,10 @@
 #ifndef TEST_RID_H
 #define TEST_RID_H
 
+#include "core/os/thread.h"
+#include "core/templates/local_vector.h"
 #include "core/templates/rid.h"
+#include "core/templates/rid_owner.h"
 
 #include "tests/test_macros.h"
 
@@ -96,6 +99,136 @@ TEST_CASE("[RID] 'get_local_index'") {
 	CHECK(RID::from_uint64(4'294'967'295).get_local_index() == 4'294'967'295);
 	CHECK(RID::from_uint64(4'294'967'297).get_local_index() == 1);
 }
+
+// This case would let sanitizers realize data races.
+// Additionally, on purely weakly ordered architectures, it would detect synchronization issues
+// if RID_Alloc failed to impose proper memory ordering and the test's threads are distributed
+// among multiple L1 caches.
+TEST_CASE("[RID_Owner] Thread safety") {
+	struct DataHolder {
+		char data[Thread::CACHE_LINE_BYTES];
+	};
+
+	struct RID_OwnerTester {
+		uint32_t thread_count = 0;
+		RID_Owner<DataHolder, true> rid_owner;
+		TightLocalVector<Thread> threads;
+		SafeNumeric<uint32_t> next_thread_idx;
+		// Using std::atomic directly since SafeNumeric doesn't support relaxed ordering.
+		TightLocalVector<std::atomic<uint64_t>> rids; // Atomic here to prevent false data race warnings.
+		std::atomic<uint32_t> sync[2] = {};
+		std::atomic<uint32_t> correct = 0;
+
+		// A barrier that doesn't introduce memory ordering constraints, only compiler ones.
+		// The idea is not to cause any sync traffic that would make the code we want to test
+		// seem correct as a side effect.
+		void lockstep(uint32_t p_step) {
+			uint32_t buf_idx = p_step % 2;
+			uint32_t target = (p_step / 2 + 1) * threads.size();
+			sync[buf_idx].fetch_add(1, std::memory_order_relaxed);
+			do {
+				std::this_thread::yield();
+			} while (sync[buf_idx].load(std::memory_order_relaxed) != target);
+		}
+
+		explicit RID_OwnerTester(bool p_chunk_for_all, bool p_chunks_preallocated) :
+				thread_count(OS::get_singleton()->get_processor_count()),
+				rid_owner(sizeof(DataHolder) * (p_chunk_for_all ? thread_count : 1)) {
+			threads.resize(thread_count);
+			rids.resize(threads.size());
+			if (p_chunks_preallocated) {
+				LocalVector<RID> prealloc_rids;
+				for (uint32_t i = 0; i < (p_chunk_for_all ? 1 : threads.size()); i++) {
+					prealloc_rids.push_back(rid_owner.make_rid());
+				}
+				for (uint32_t i = 0; i < prealloc_rids.size(); i++) {
+					rid_owner.free(prealloc_rids[i]);
+				}
+			}
+		}
+
+		~RID_OwnerTester() {
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				rid_owner.free(RID::from_uint64(rids[i].load(std::memory_order_relaxed)));
+			}
+		}
+
+		void test() {
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				threads[i].start(
+						[](void *p_data) {
+							RID_OwnerTester *rot = (RID_OwnerTester *)p_data;
+
+							auto _compute_thread_unique_byte = [](uint32_t p_idx) -> char {
+								return ((p_idx & 0xff) ^ (0b11111110 << (p_idx % 8)));
+							};
+
+							// 1. Each thread gets a zero-based index.
+							uint32_t self_th_idx = rot->next_thread_idx.postincrement();
+
+							rot->lockstep(0);
+
+							// 2. Each thread makes a RID holding unique data.
+							DataHolder initial_data;
+							memset(&initial_data, _compute_thread_unique_byte(self_th_idx), Thread::CACHE_LINE_BYTES);
+							RID my_rid = rot->rid_owner.make_rid(initial_data);
+							rot->rids[self_th_idx].store(my_rid.get_id(), std::memory_order_relaxed);
+
+							rot->lockstep(1);
+
+							// 3. Each thread verifies all the others.
+							uint32_t local_correct = 0;
+							for (uint32_t th_idx = 0; th_idx < rot->threads.size(); th_idx++) {
+								if (th_idx == self_th_idx) {
+									continue;
+								}
+								char expected_unique_byte = _compute_thread_unique_byte(th_idx);
+								RID rid = RID::from_uint64(rot->rids[th_idx].load(std::memory_order_relaxed));
+								DataHolder *data = rot->rid_owner.get_or_null(rid);
+								bool ok = true;
+								for (uint32_t j = 0; j < Thread::CACHE_LINE_BYTES; j++) {
+									if (data->data[j] != expected_unique_byte) {
+										ok = false;
+										break;
+									}
+								}
+								if (ok) {
+									local_correct++;
+								}
+							}
+
+							rot->lockstep(2);
+
+							rot->correct.fetch_add(local_correct, std::memory_order_acq_rel);
+						},
+						this);
+			}
+
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				threads[i].wait_to_finish();
+			}
+
+			CHECK_EQ(correct.load(), threads.size() * (threads.size() - 1));
+		}
+	};
+
+	SUBCASE("All items in one chunk, pre-allocated") {
+		RID_OwnerTester tester(true, true);
+		tester.test();
+	}
+	SUBCASE("All items in one chunk, NOT pre-allocated") {
+		RID_OwnerTester tester(true, false);
+		tester.test();
+	}
+	SUBCASE("One item per chunk, pre-allocated") {
+		RID_OwnerTester tester(false, true);
+		tester.test();
+	}
+	SUBCASE("One item per chunk, NOT pre-allocated") {
+		RID_OwnerTester tester(false, false);
+		tester.test();
+	}
+}
 } // namespace TestRID
 
 #endif // TEST_RID_H