diff --git a/core/os/spin_lock.h b/core/os/spin_lock.h
index acd07f54904d..5b28d9d525af 100644
--- a/core/os/spin_lock.h
+++ b/core/os/spin_lock.h
@@ -31,27 +31,6 @@
 #ifndef SPIN_LOCK_H
 #define SPIN_LOCK_H
 
-#include "core/typedefs.h"
-
-#if defined(__APPLE__)
-
-#include <os/lock.h>
-
-class SpinLock {
-	mutable os_unfair_lock _lock = OS_UNFAIR_LOCK_INIT;
-
-public:
-	_ALWAYS_INLINE_ void lock() const {
-		os_unfair_lock_lock(&_lock);
-	}
-
-	_ALWAYS_INLINE_ void unlock() const {
-		os_unfair_lock_unlock(&_lock);
-	}
-};
-
-#else
-
 #include "core/os/thread.h"
 
 #include <atomic>
@@ -59,14 +38,14 @@
 
 static_assert(std::atomic_bool::is_always_lock_free);
 
-class alignas(Thread::CACHE_LINE_BYTES) SpinLock {
+class alignas(Thread::CACHE_LINE_BYTES) AcqRelSpinLock {
 	mutable std::atomic<bool> locked = ATOMIC_VAR_INIT(false);
 
 public:
 	_ALWAYS_INLINE_ void lock() const {
 		while (true) {
 			bool expected = false;
-			if (locked.compare_exchange_weak(expected, true, std::memory_order_acq_rel, std::memory_order_relaxed)) {
+			if (locked.compare_exchange_weak(expected, true, std::memory_order_acquire, std::memory_order_relaxed)) {
 				break;
 			}
 			do {
@@ -78,8 +57,46 @@ class alignas(Thread::CACHE_LINE_BYTES) SpinLock {
 	_ALWAYS_INLINE_ void unlock() const {
 		locked.store(false, std::memory_order_release);
 	}
+
+	_ALWAYS_INLINE_ void acquire() const {
+		(void)locked.load(std::memory_order_acquire);
+	}
+
+	_ALWAYS_INLINE_ void release() const {
+		// Do as little as possible to issue a release on the atomic
+		// without changing its value.
+		while (true) {
+			for (int i = 0; i < 2; i++) {
+				bool expected = (bool)i;
+				if (locked.compare_exchange_weak(expected, expected, std::memory_order_release, std::memory_order_relaxed)) {
+					return;
+				}
+			}
+		}
+	}
 };
 
+#if defined(__APPLE__)
+
+#include <os/lock.h>
+
+class SpinLock {
+	mutable os_unfair_lock _lock = OS_UNFAIR_LOCK_INIT;
+
+public:
+	_ALWAYS_INLINE_ void lock() const {
+		os_unfair_lock_lock(&_lock);
+	}
+
+	_ALWAYS_INLINE_ void unlock() const {
+		os_unfair_lock_unlock(&_lock);
+	}
+};
+
+#else
+
+using SpinLock = AcqRelSpinLock;
+
 #endif // __APPLE__
 
 #endif // SPIN_LOCK_H
diff --git a/core/templates/rid_owner.h b/core/templates/rid_owner.h
index 42001590546c..8cd135b07226 100644
--- a/core/templates/rid_owner.h
+++ b/core/templates/rid_owner.h
@@ -32,7 +32,7 @@
 #define RID_OWNER_H
 
 #include "core/os/memory.h"
-#include "core/os/mutex.h"
+#include "core/os/spin_lock.h"
 #include "core/string/print_string.h"
 #include "core/templates/hash_set.h"
 #include "core/templates/list.h"
@@ -43,6 +43,20 @@
 #include <stdio.h>
 #include <typeinfo>
 
+#ifdef SANITIZERS_ENABLED
+#ifdef __has_feature
+#if __has_feature(thread_sanitizer)
+#define TSAN_ENABLED
+#endif
+#elif defined(__SANITIZE_THREAD__)
+#define TSAN_ENABLED
+#endif
+#endif
+
+#ifdef TSAN_ENABLED
+#include <sanitizer/tsan_interface.h>
+#endif
+
 class RID_AllocBase {
 	static SafeNumeric<uint64_t> base_id;
 
@@ -83,18 +97,18 @@ class RID_Alloc : public RID_AllocBase {
 
 	const char *description = nullptr;
 
-	mutable Mutex mutex;
+	AcqRelSpinLock spin;
 
 	_FORCE_INLINE_ RID _allocate_rid() {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		if (alloc_count == max_alloc) {
 			//allocate a new chunk
 			uint32_t chunk_count = alloc_count == 0 ? 0 : (max_alloc / elements_in_chunk);
 			if (THREAD_SAFE && chunk_count == chunk_limit) {
-				mutex.unlock();
+				spin.unlock();
 				if (description != nullptr) {
 					ERR_FAIL_V_MSG(RID(), vformat("Element limit for RID of type '%s' reached.", String(description)));
 				} else {
@@ -120,7 +134,8 @@ class RID_Alloc : public RID_AllocBase {
 				free_list_chunks[chunk_count][i] = alloc_count + i;
 			}
 
-			max_alloc += elements_in_chunk;
+			// Store atomically to avoid data race with the load in get_or_null().
+			((std::atomic<uint32_t> *)&max_alloc)->store(max_alloc + elements_in_chunk, std::memory_order_relaxed);
 		}
 
 		uint32_t free_index = free_list_chunks[alloc_count / elements_in_chunk][alloc_count % elements_in_chunk];
@@ -140,7 +155,7 @@ class RID_Alloc : public RID_AllocBase {
 		alloc_count++;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 
 		return _make_from_id(id);
@@ -168,9 +183,13 @@ class RID_Alloc : public RID_AllocBase {
 			return nullptr;
 		}
 
+		spin.acquire();
+
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
-		if (unlikely(idx >= max_alloc)) {
+		// Read atomically to avoid data race with the store in _allocate_rid().
+		uint32_t ma = ((std::atomic<uint32_t> *)&max_alloc)->load(std::memory_order_acquire);
+		if (unlikely(idx >= ma)) {
 			return nullptr;
 		}
 
@@ -180,6 +199,9 @@ class RID_Alloc : public RID_AllocBase {
 		uint32_t validator = uint32_t(id >> 32);
 
 		Chunk &c = chunks[idx_chunk][idx_element];
+#ifdef TSAN_ENABLED
+		__tsan_acquire(&c.validator); // We know not a race in practice.
+#endif
 		if (unlikely(p_initialize)) {
 			if (unlikely(!(c.validator & 0x80000000))) {
 				ERR_FAIL_V_MSG(nullptr, "Initializing already initialized RID");
@@ -189,7 +211,8 @@ class RID_Alloc : public RID_AllocBase {
 				ERR_FAIL_V_MSG(nullptr, "Attempting to initialize the wrong RID");
 			}
 
-			c.validator &= 0x7FFFFFFF; //initialized
+			// Mark initialized.
+			c.validator &= 0x7FFFFFFF;
 
 		} else if (unlikely(c.validator != validator)) {
 			if ((c.validator & 0x80000000) && c.validator != 0xFFFFFFFF) {
@@ -197,6 +220,9 @@ class RID_Alloc : public RID_AllocBase {
 			}
 			return nullptr;
 		}
+#ifdef TSAN_ENABLED
+		__tsan_release(&c.validator);
+#endif
 
 		T *ptr = &c.data;
 
@@ -205,24 +231,39 @@ class RID_Alloc : public RID_AllocBase {
 	void initialize_rid(RID p_rid) {
 		T *mem = get_or_null(p_rid, true);
 		ERR_FAIL_NULL(mem);
+#ifdef TSAN_ENABLED
+		__tsan_acquire(mem); // We know not a race in practice.
+#endif
 		memnew_placement(mem, T);
+#ifdef TSAN_ENABLED
+		__tsan_release(mem);
+#endif
+		spin.release();
 	}
+
 	void initialize_rid(RID p_rid, const T &p_value) {
 		T *mem = get_or_null(p_rid, true);
 		ERR_FAIL_NULL(mem);
+#ifdef TSAN_ENABLED
+		__tsan_acquire(mem); // We know not a race in practice.
+#endif
 		memnew_placement(mem, T(p_value));
+#ifdef TSAN_ENABLED
+		__tsan_release(mem);
+#endif
+		spin.release();
 	}
 
 	_FORCE_INLINE_ bool owns(const RID &p_rid) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
 		if (unlikely(idx >= max_alloc)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			return false;
 		}
@@ -235,7 +276,7 @@ class RID_Alloc : public RID_AllocBase {
 		bool owned = (validator != 0x7FFFFFFF) && (chunks[idx_chunk][idx_element].validator & 0x7FFFFFFF) == validator;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 
 		return owned;
@@ -243,14 +284,14 @@ class RID_Alloc : public RID_AllocBase {
 
 	_FORCE_INLINE_ void free(const RID &p_rid) {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 
 		uint64_t id = p_rid.get_id();
 		uint32_t idx = uint32_t(id & 0xFFFFFFFF);
 		if (unlikely(idx >= max_alloc)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL();
 		}
@@ -261,12 +302,12 @@ class RID_Alloc : public RID_AllocBase {
 		uint32_t validator = uint32_t(id >> 32);
 		if (unlikely(chunks[idx_chunk][idx_element].validator & 0x80000000)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL_MSG("Attempted to free an uninitialized or invalid RID");
 		} else if (unlikely(chunks[idx_chunk][idx_element].validator != validator)) {
 			if constexpr (THREAD_SAFE) {
-				mutex.unlock();
+				spin.unlock();
 			}
 			ERR_FAIL();
 		}
@@ -278,7 +319,7 @@ class RID_Alloc : public RID_AllocBase {
 		free_list_chunks[alloc_count / elements_in_chunk][alloc_count % elements_in_chunk] = idx;
 
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
@@ -287,35 +328,35 @@ class RID_Alloc : public RID_AllocBase {
 	}
 	void get_owned_list(List<RID> *p_owned) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 		for (size_t i = 0; i < max_alloc; i++) {
-			uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+			uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 			if (validator != 0xFFFFFFFF) {
-				p_owned->push_back(_make_from_id((validator << 32) | i));
+				p_owned->push_back(_make_from_id(((uint64_t)validator << 32) | i));
 			}
 		}
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
 	//used for fast iteration in the elements or RIDs
 	void fill_owned_buffer(RID *p_rid_buffer) const {
 		if constexpr (THREAD_SAFE) {
-			mutex.lock();
+			spin.lock();
 		}
 		uint32_t idx = 0;
 		for (size_t i = 0; i < max_alloc; i++) {
-			uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+			uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 			if (validator != 0xFFFFFFFF) {
-				p_rid_buffer[idx] = _make_from_id((validator << 32) | i);
+				p_rid_buffer[idx] = _make_from_id(((uint64_t)validator << 32) | i);
 				idx++;
 			}
 		}
 		if constexpr (THREAD_SAFE) {
-			mutex.unlock();
+			spin.unlock();
 		}
 	}
 
@@ -329,16 +370,21 @@ class RID_Alloc : public RID_AllocBase {
 			chunk_limit = (p_maximum_number_of_elements / elements_in_chunk) + 1;
 			chunks = (Chunk **)memalloc(sizeof(Chunk *) * chunk_limit);
 			free_list_chunks = (uint32_t **)memalloc(sizeof(uint32_t *) * chunk_limit);
+			spin.release();
 		}
 	}
 
 	~RID_Alloc() {
+		if constexpr (THREAD_SAFE) {
+			spin.lock();
+		}
+
 		if (alloc_count) {
 			print_error(vformat("ERROR: %d RID allocations of type '%s' were leaked at exit.",
					alloc_count, description ? description : typeid(T).name()));
 
 			for (size_t i = 0; i < max_alloc; i++) {
-				uint64_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
+				uint32_t validator = chunks[i / elements_in_chunk][i % elements_in_chunk].validator;
 				if (validator & 0x80000000) {
 					continue; //uninitialized
 				}
diff --git a/tests/core/templates/test_rid.h b/tests/core/templates/test_rid.h
index ba9a2bb5e267..c35dfb8c7bab 100644
--- a/tests/core/templates/test_rid.h
+++ b/tests/core/templates/test_rid.h
@@ -31,7 +31,10 @@
 #ifndef TEST_RID_H
 #define TEST_RID_H
 
+#include "core/os/thread.h"
+#include "core/templates/local_vector.h"
 #include "core/templates/rid.h"
+#include "core/templates/rid_owner.h"
 
 #include "tests/test_macros.h"
 
@@ -96,6 +99,136 @@ TEST_CASE("[RID] 'get_local_index'") {
 	CHECK(RID::from_uint64(4'294'967'295).get_local_index() == 4'294'967'295);
 	CHECK(RID::from_uint64(4'294'967'297).get_local_index() == 1);
 }
+
+// This case would let sanitizers realize data races.
+// Additionally, on purely weakly ordered architectures, it would detect synchronization issues
+// if RID_Alloc failed to impose proper memory ordering and the test's threads are distributed
+// among multiple L1 caches.
+TEST_CASE("[RID_Owner] Thread safety") {
+	struct DataHolder {
+		char data[Thread::CACHE_LINE_BYTES];
+	};
+
+	struct RID_OwnerTester {
+		uint32_t thread_count = 0;
+		RID_Owner<DataHolder, true> rid_owner;
+		TightLocalVector<Thread> threads;
+		SafeNumeric<uint32_t> next_thread_idx;
+		// Using std::atomic directly since SafeNumeric doesn't support relaxed ordering.
+		TightLocalVector<std::atomic<uint64_t>> rids; // Atomic here to prevent false data race warnings.
+		std::atomic<uint32_t> sync[2] = {};
+		std::atomic<uint32_t> correct = 0;
+
+		// A barrier that doesn't introduce memory ordering constraints, only compiler ones.
+		// The idea is not to cause any sync traffic that would make the code we want to test
+		// seem correct as a side effect.
+		void lockstep(uint32_t p_step) {
+			uint32_t buf_idx = p_step % 2;
+			uint32_t target = (p_step / 2 + 1) * threads.size();
+			sync[buf_idx].fetch_add(1, std::memory_order_relaxed);
+			do {
+				std::this_thread::yield();
+			} while (sync[buf_idx].load(std::memory_order_relaxed) != target);
+		}
+
+		explicit RID_OwnerTester(bool p_chunk_for_all, bool p_chunks_preallocated) :
+				thread_count(OS::get_singleton()->get_processor_count()),
+				rid_owner(sizeof(DataHolder) * (p_chunk_for_all ? thread_count : 1)) {
+			threads.resize(thread_count);
+			rids.resize(threads.size());
+			if (p_chunks_preallocated) {
+				LocalVector<RID> prealloc_rids;
+				for (uint32_t i = 0; i < (p_chunk_for_all ? 1 : threads.size()); i++) {
+					prealloc_rids.push_back(rid_owner.make_rid());
+				}
+				for (uint32_t i = 0; i < prealloc_rids.size(); i++) {
+					rid_owner.free(prealloc_rids[i]);
+				}
+			}
+		}
+
+		~RID_OwnerTester() {
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				rid_owner.free(RID::from_uint64(rids[i].load(std::memory_order_relaxed)));
+			}
+		}
+
+		void test() {
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				threads[i].start(
+						[](void *p_data) {
+							RID_OwnerTester *rot = (RID_OwnerTester *)p_data;
+
+							auto _compute_thread_unique_byte = [](uint32_t p_idx) -> char {
+								return ((p_idx & 0xff) ^ (0b11111110 << (p_idx % 8)));
+							};
+
+							// 1. Each thread gets a zero-based index.
+							uint32_t self_th_idx = rot->next_thread_idx.postincrement();
+
+							rot->lockstep(0);
+
+							// 2. Each thread makes a RID holding unique data.
+							DataHolder initial_data;
+							memset(&initial_data, _compute_thread_unique_byte(self_th_idx), Thread::CACHE_LINE_BYTES);
+							RID my_rid = rot->rid_owner.make_rid(initial_data);
+							rot->rids[self_th_idx].store(my_rid.get_id(), std::memory_order_relaxed);
+
+							rot->lockstep(1);
+
+							// 3. Each thread verifies all the others.
+							uint32_t local_correct = 0;
+							for (uint32_t th_idx = 0; th_idx < rot->threads.size(); th_idx++) {
+								if (th_idx == self_th_idx) {
+									continue;
+								}
+								char expected_unique_byte = _compute_thread_unique_byte(th_idx);
+								RID rid = RID::from_uint64(rot->rids[th_idx].load(std::memory_order_relaxed));
+								DataHolder *data = rot->rid_owner.get_or_null(rid);
+								bool ok = true;
+								for (uint32_t j = 0; j < Thread::CACHE_LINE_BYTES; j++) {
+									if (data->data[j] != expected_unique_byte) {
+										ok = false;
+										break;
+									}
+								}
+								if (ok) {
+									local_correct++;
+								}
+							}
+
+							rot->lockstep(2);
+
+							rot->correct.fetch_add(local_correct, std::memory_order_acq_rel);
+						},
+						this);
+			}
+
+			for (uint32_t i = 0; i < threads.size(); i++) {
+				threads[i].wait_to_finish();
+			}
+
+			CHECK_EQ(correct.load(), threads.size() * (threads.size() - 1));
+		}
+	};
+
+	SUBCASE("All items in one chunk, pre-allocated") {
+		RID_OwnerTester tester(true, true);
+		tester.test();
+	}
+	SUBCASE("All items in one chunk, NOT pre-allocated") {
+		RID_OwnerTester tester(true, false);
+		tester.test();
+	}
+	SUBCASE("One item per chunk, pre-allocated") {
+		RID_OwnerTester tester(false, true);
+		tester.test();
+	}
+	SUBCASE("One item per chunk, NOT pre-allocated") {
+		RID_OwnerTester tester(false, false);
+		tester.test();
+	}
+}
 } // namespace TestRID
 
 #endif // TEST_RID_H