From e8ba639f5f994db0f5759079498dd8a6dd5a353b Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 1 Feb 2023 08:56:50 +1100
Subject: [PATCH 001/116] Update/julia master (#2)

This PR updates the binding to the latest Julia master (up to this
commit: 134f3e7dfaa04511a2f81f4a40cdc85f4e433706).
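For reviewers: the new lowering in llvm-final-gc-lowering.cpp below emits a
thread-local bump-pointer fastpath for small allocations. In C terms, the
emitted IR is roughly the following (an illustrative sketch only, not code in
this patch; `cursor` and `limit` are the new TLS fields added to
julia_threads.h in this series):

    uintptr_t cursor = (uintptr_t)ptls->cursor;
    // Align so that the object (cursor + 8-byte tag) is 16-byte aligned.
    uintptr_t delta = (-(cursor + 8)) & 15;
    uintptr_t result = cursor + delta;
    uintptr_t new_cursor = result + osize;
    if (new_cursor > (uintptr_t)ptls->limit) {
        // Slowpath: fall back to the runtime pool-allocation call.
        v = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, ty);
    }
    else {
        ptls->cursor = (void*)new_cursor;   // bump the cursor
        ptls->gc_num.allocd += osize;       // per-thread allocation accounting
        v = (jl_value_t*)(result + sizeof(jl_taggedvalue_t));
    }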
---
 Makefile                       |   3 +
 src/Makefile                   |  58 ++++++++++++
 src/array.c                    |  10 +++
 src/gc.c                       | 159 +++++++++++++++++++++++++++++++--
 src/init.c                     |  10 +++
 src/julia.h                    |   6 ++
 src/julia_internal.h           |  18 ++++
 src/julia_threads.h            |  10 +++
 src/llvm-final-gc-lowering.cpp |  68 ++++++++++++++
 src/llvm-pass-helpers.cpp      |   5 ++
 src/threading.c                |   4 +
 11 files changed, 346 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index c080f0d144cf6..bc24f9272b060 100644
--- a/Makefile
+++ b/Makefile
@@ -621,6 +621,9 @@ testall: check-whitespace $(JULIA_BUILD_MODE)
 testall1: check-whitespace $(JULIA_BUILD_MODE)
 	@env JULIA_CPU_THREADS=1 $(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/test all JULIA_BUILD_MODE=$(JULIA_BUILD_MODE)
 
+testall3: check-whitespace $(JULIA_BUILD_MODE)
+	@env JULIA_CPU_THREADS=3 $(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/test all JULIA_BUILD_MODE=$(JULIA_BUILD_MODE)
+
 test-%: check-whitespace $(JULIA_BUILD_MODE) .FORCE
 	@([ $$(( $$(date +%s) - $$(date -r $(build_private_libdir)/sys.$(SHLIB_EXT) +%s) )) -le 100 ] && \
 	printf '\033[93m    HINT The system image was recently rebuilt. Are you aware of the test-revise-* targets? See CONTRIBUTING.md. \033[0m\n') || true

diff --git a/src/Makefile b/src/Makefile
index 0baa34fedf877..d113eea5422a5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,6 +4,17 @@ BUILDDIR := .
 include $(JULIAHOME)/Make.inc
 include $(JULIAHOME)/deps/llvm-ver.make
 
+ifeq ($(USE_MMTK), 1)
+CFLAGS = -DMMTKHEAP
+CPPFLAGS = -DMMTKHEAP
+MMTK_BUILD_TYPE = ${MMTK_BUILD}
+MMTK_DIR = ${MMTK_JULIA_DIR}
+MMTK_API_DIR_INCLUDE = $(MMTK_DIR)/api
+MMTK_JULIA_DIR_INCLUDE = $(MMTK_DIR)/../julia
+MMTK_LIB := -L$(MMTK_DIR)/target/$(MMTK_BUILD_TYPE)/ -lmmtk_julia
+LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD_TYPE)/
+endif
+
 JCFLAGS += $(CFLAGS)
 JCXXFLAGS += $(CXXFLAGS)
 JCPPFLAGS += $(CPPFLAGS)
@@ -18,6 +29,11 @@ FLAGS := \
 	-I$(SRCDIR)/flisp -I$(SRCDIR)/support \
 	-I$(LIBUV_INC) -I$(build_includedir) \
 	-I$(JULIAHOME)/deps/valgrind
+
+ifeq ($(USE_MMTK), 1)
+FLAGS += -I$(MMTK_API_DIR_INCLUDE) -I$(MMTK_JULIA_DIR_INCLUDE)
+endif
+
 FLAGS += -Wall -Wno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden -fno-common \
 	-Wno-comment -Wpointer-arith -Wundef
 ifeq ($(USEGCC),1) # GCC bug #25509 (void)__attribute__((warn_unused_result))
@@ -159,6 +175,12 @@ endif
 COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir)
 RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS)
 CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS)
+
+ifeq ($(USE_MMTK), 1)
+CG_LIBS += $(MMTK_LIB)
+RT_LIBS += $(MMTK_LIB)
+endif
+
 RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS)
 CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug
 RT_RELEASE_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a -ljulia $(RT_LIBS)
@@ -167,6 +189,12 @@ CG_RELEASE_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia -ljulia-internal
 OBJS := $(SRCS:%=$(BUILDDIR)/%.o)
 DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj)
 
+ifeq ($(USE_MMTK), 1)
+MMTK_SRCS := mmtk_julia
+MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_DIR_INCLUDE)/%.o)
+MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_DIR_INCLUDE)/%.o)
+endif
+
 CODEGEN_OBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.o)
 CODEGEN_DOBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.dbg.obj)
@@ -234,6 +262,13 @@ $(BUILDDIR)/%.o : $(SRCDIR)/%.d
 $(BUILDDIR)/%.dbg.obj : $(SRCDIR)/%.d
 	@$(call PRINT_DTRACE, $(DTRACE) -G -s $< -o $@)
 
+ifeq ($(USE_MMTK), 1)
+$(MMTK_JULIA_DIR_INCLUDE)/%.o: $(MMTK_JULIA_DIR_INCLUDE)/%.c $(HEADERS) | $(MMTK_JULIA_DIR_INCLUDE)
+	@$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@)
+$(MMTK_JULIA_DIR_INCLUDE)/%.dbg.obj: $(MMTK_JULIA_DIR_INCLUDE)/%.c $(HEADERS) | $(MMTK_JULIA_DIR_INCLUDE)
+	@$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@)
+endif
+
 # public header rules
 $(eval $(call dir_target,$(build_includedir)/julia))
 define public_header_target
@@ -363,6 +398,19 @@ $(BUILDDIR)/julia_version.h: $(JULIAHOME)/VERSION
 
 CXXLD = $(CXX) -shared
 
+ifeq ($(USE_MMTK), 1)
+$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(OBJS) $(MMTK_OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV)
+	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(MMTK_OBJS) $(RPATH_LIB) -o $@ \
+		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT)))
+	@$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@
+	$(DSYMUTIL) $@
+
+$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(DOBJS) $(MMTK_DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV)
+	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(MMTK_DOBJS) $(RPATH_LIB) -o $@ \
+		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT)))
+	@$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@
+	$(DSYMUTIL) $@
+else
 $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV)
 	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \
 		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT)))
@@ -374,6 +422,7 @@ $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)
 		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT)))
 	@$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@
 	$(DSYMUTIL) $@
+endif
 
 ifneq ($(OS), WINNT)
 $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_SHLIB_EXT) $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT): $(build_shlibdir)/libjulia-internal%.$(JL_MAJOR_SHLIB_EXT): \
@@ -415,11 +464,20 @@ libjulia-codegen-release: $(build_shlibdir)/libjulia-codegen.$(JL_MAJOR_MINOR_SH
 libjulia-codegen-debug: $(build_shlibdir)/libjulia-codegen-debug.$(JL_MAJOR_MINOR_SHLIB_EXT)
 libjulia-codegen-debug libjulia-codegen-release: $(PUBLIC_HEADER_TARGETS)
 
+ifeq ($(USE_MMTK), 1)
+clean:
+	-rm -fr $(build_shlibdir)/libjulia-internal* $(build_shlibdir)/libccalltest* $(build_shlibdir)/libllvmcalltest*
+	-rm -f $(BUILDDIR)/julia_flisp.boot $(BUILDDIR)/julia_flisp.boot.inc
+	-rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a
+	-rm -f $(BUILDDIR)/julia_version.h
+	-rm -fr $(MMTK_JULIA_DIR_INCLUDE)/*.o
+else
 clean:
 	-rm -fr $(build_shlibdir)/libjulia-internal* $(build_shlibdir)/libjulia-codegen* $(build_shlibdir)/libccalltest* $(build_shlibdir)/libllvmcalltest*
 	-rm -f $(BUILDDIR)/julia_flisp.boot $(BUILDDIR)/julia_flisp.boot.inc $(BUILDDIR)/jl_internal_funcs.inc
 	-rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a $(BUILDDIR)/*.h.gen
 	-rm -f $(BUILDDIR)/julia_version.h
+endif
 
 clean-flisp:
 	-$(MAKE) -C $(SRCDIR)/flisp clean BUILDDIR='$(abspath $(BUILDDIR)/flisp)'

diff --git a/src/array.c b/src/array.c
index ae89087502627..f515f5d26c024 100644
--- a/src/array.c
+++ b/src/array.c
@@ -497,17 +497,27 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len)
     jl_ptls_t ptls = ct->ptls;
     const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
     if (sz <= GC_MAX_SZCLASS) {
+#ifndef MMTKHEAP
         int pool_id = jl_gc_szclass_align8(allocsz);
         jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
         int osize = jl_gc_sizeclasses[pool_id];
         // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in
         // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
         s = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize);
+#else
+        int pool_id = jl_gc_szclass_align8(allocsz);
+        int osize = jl_gc_sizeclasses[pool_id];
+        s = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, jl_string_type);
+#endif
     }
     else {
         if (allocsz < sz) // overflow in adding offs, size was "negative"
             jl_throw(jl_memory_exception);
+#ifndef MMTKHEAP
         s = jl_gc_big_alloc_noinline(ptls, allocsz);
+#else
+        s = jl_mmtk_gc_alloc_big(ptls, allocsz);
+#endif
     }
     jl_set_typeof(s, jl_string_type);
     maybe_record_alloc_to_profile(s, len, jl_string_type);

diff --git a/src/gc.c b/src/gc.c
index fc2a4041910f5..7eb05fbb12251 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -7,6 +7,10 @@
 #include <malloc.h> // for malloc_trim
 #endif
 
+#ifdef MMTKHEAP
+#include "mmtk_julia.h"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -244,6 +248,9 @@ STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT
 #else
 STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 {
+#ifdef MMTKHEAP
+    return mmtk_malloc_aligned(sz, align);
+#endif
 #if defined(_P64) || defined(__APPLE__)
     if (align <= 16)
         return malloc(sz);
@@ -256,6 +263,14 @@ STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz,
                                        size_t align)
 {
+#ifdef MMTKHEAP
+    void *res = jl_malloc_aligned(sz, align);
+    if (res != NULL) {
+        memcpy(res, d, oldsz > sz ? sz : oldsz);
+        mmtk_free_aligned(d);
+    }
+    return res;
+#endif
 #if defined(_P64) || defined(__APPLE__)
     if (align <= 16)
         return realloc(d, sz);
@@ -269,7 +284,11 @@ STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz,
 }
 STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT
 {
+#ifdef MMTKHEAP
+    mmtk_free_aligned(p);
+#else
     free(p);
+#endif
 }
 #endif
 #define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT)
@@ -284,7 +303,10 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT
     jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1);
 }
 
-static void run_finalizer(jl_task_t *ct, void *o, void *ff)
+#ifndef MMTKHEAP
+static
+#endif
+void run_finalizer(jl_task_t *ct, void *o, void *ff)
 {
     int ptr_finalizer = gc_ptr_tag(o, 1);
     o = gc_ptr_clear_tag(o, 3);
@@ -393,7 +415,10 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO
     ct->sticky = sticky;
 }
 
-static uint64_t finalizer_rngState[4];
+#ifndef MMTKHEAP
+static
+#endif
+uint64_t finalizer_rngState[4];
 
 void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT;
 
@@ -404,6 +429,10 @@ JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void)
 
 static void run_finalizers(jl_task_t *ct)
 {
+#ifdef MMTKHEAP
+    mmtk_jl_run_finalizers(ct->ptls);
+    return;
+#endif
     // Racy fast path:
     // The race here should be OK since the race can only happen if
     // another thread is writing to it with the lock held. In such case,
@@ -442,6 +471,10 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct)
 {
     if (ct == NULL)
         ct = jl_current_task;
+#ifdef MMTKHEAP
+    mmtk_jl_run_pending_finalizers(ct->ptls);
+    return;
+#endif
     jl_ptls_t ptls = ct->ptls;
     if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) {
         run_finalizers(ct);
@@ -532,6 +565,10 @@ void jl_gc_run_all_finalizers(jl_task_t *ct)
 
 void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT
 {
+#ifdef MMTKHEAP
+    register_finalizer(v, f, 0);
+    return;
+#endif
     assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0);
     arraylist_t *a = &ptls->finalizers;
     // This acquire load and the release store at the end are used to
@@ -560,14 +597,20 @@ void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT
 
 JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT
 {
+#ifndef MMTKHEAP
     jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f);
+#else
+    register_finalizer(v, f, 1);
+#endif
 }
 
 // schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads)
 JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT
 {
+#ifndef MMTKHEAP
     assert(!gc_ptr_tag(v, 3));
     jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f);
+#endif
 }
 
 JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT
@@ -582,6 +625,10 @@ JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_funct
 
 JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o)
 {
+#ifdef MMTKHEAP
+    run_finalizers_for_obj(o);
+    return;
+#endif
     JL_LOCK_NOGC(&finalizers_lock);
     // Copy the finalizers into a temporary list so that code in the finalizer
     // won't change the list as we loop through them.
@@ -955,12 +1002,16 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT
 
 static inline void maybe_collect(jl_ptls_t ptls)
 {
+#ifndef MMTKHEAP
     if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) {
         jl_gc_collect(JL_GC_AUTO);
     }
     else {
         jl_gc_safepoint_(ptls);
     }
+#else
+    mmtk_gc_poll(ptls);
+#endif
 }
 
 // weak references
@@ -971,7 +1022,11 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls,
     jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type);
     wr->value = value;  // NOTE: wb not needed here
+#ifdef MMTKHEAP
+    mmtk_add_weak_candidate(wr);
+#else
     arraylist_push(&ptls->heap.weak_refs, wr);
+#endif
     return wr;
 }
@@ -1219,14 +1274,25 @@ size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT
     return sz;
 }
 
-static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
+#ifndef MMTKHEAP
+static
+#endif
+void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
 {
     if (a->flags.how == 2) {
         char *d = (char*)a->data - a->offset*a->elsize;
+#ifndef MMTKHEAP
         if (a->flags.isaligned)
             jl_free_aligned(d);
         else
             free(d);
+#else
+        if (a->flags.isaligned)
+            mmtk_free_aligned(d);
+        else {
+            mmtk_free(d);
+        }
+#endif
         gc_num.freed += jl_array_nbytes(a);
         gc_num.freecall++;
     }
@@ -1703,6 +1769,7 @@ static void gc_sweep_perm_alloc(void)
 
 JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr)
 {
+#ifndef MMTKHEAP
     jl_ptls_t ptls = jl_current_task->ptls;
     jl_taggedvalue_t *o = jl_astaggedvalue(ptr);
     // The modification of the `gc_bits` is not atomic but it
@@ -1712,6 +1779,7 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr)
     o->bits.gc = GC_MARKED;
     arraylist_push(ptls->heap.remset, (jl_value_t*)ptr);
     ptls->heap.remset_nptr++; // conservative
+#endif
 }
 
 void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT
@@ -3066,9 +3134,15 @@ JL_DLLEXPORT int jl_gc_enable(int on)
         if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) {
             gc_num.allocd += gc_num.deferred_alloc;
             gc_num.deferred_alloc = 0;
+#ifdef MMTKHEAP
+            enable_collection();
+#endif
         }
     }
     else if (prev && !on) {
+#ifdef MMTKHEAP
+        disable_collection();
+#endif
         // enable -> disable
         jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
         // check if the GC is running and wait for it to finish
@@ -3134,7 +3208,10 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void)
     return live_bytes;
 }
 
-static void jl_gc_premark(jl_ptls_t ptls2)
+#ifndef MMTKHEAP
+static
+#endif
+void jl_gc_premark(jl_ptls_t ptls2)
 {
     arraylist_t *remset = ptls2->heap.remset;
     ptls2->heap.remset = ptls2->heap.last_remset;
@@ -3465,6 +3542,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
         jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes);
         return;
     }
+#ifdef MMTKHEAP
+    handle_user_collection_request(ptls);
+    return;
+#endif
     jl_gc_debug_print();
 
     int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
@@ -3593,6 +3674,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     memset(&ptls->gc_num, 0, sizeof(ptls->gc_num));
     jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval);
+
+#ifdef MMTKHEAP
+    MMTk_Mutator mmtk_mutator = bind_mutator((void *)ptls, ptls->tid);
+    ptls->mmtk_mutator_ptr = ((MMTkMutatorContext*)mmtk_mutator);
+#endif
 }
 
 // System-wide initializations
@@ -3632,6 +3717,50 @@ void jl_gc_init(void)
 
     if (high_water_mark < max_total_memory)
         max_total_memory = high_water_mark;
 
+#ifdef MMTKHEAP
+    long long min_heap_size;
+    long long max_heap_size;
+    char* min_size_def = getenv("MMTK_MIN_HSIZE");
+    char* min_size_gb = getenv("MMTK_MIN_HSIZE_G");
+
+    char* max_size_def = getenv("MMTK_MAX_HSIZE");
+    char* max_size_gb = getenv("MMTK_MAX_HSIZE_G");
+
+    // default min heap currently set as Julia's default_collect_interval
+    if (min_size_def != NULL) {
+        char *p;
+        double min_size = strtod(min_size_def, &p);
+        min_heap_size = (long) 1024 * 1024 * min_size;
+    } else if (min_size_gb != NULL) {
+        char *p;
+        double min_size = strtod(min_size_gb, &p);
+        min_heap_size = (long) 1024 * 1024 * 1024 * min_size;
+    } else {
+        min_heap_size = default_collect_interval;
+    }
+
+    // default max heap currently set as 70% the free memory in the system
+    if (max_size_def != NULL) {
+        char *p;
+        double max_size = strtod(max_size_def, &p);
+        max_heap_size = (long) 1024 * 1024 * max_size;
+    } else if (max_size_gb != NULL) {
+        char *p;
+        double max_size = strtod(max_size_gb, &p);
+        max_heap_size = (long) 1024 * 1024 * 1024 * max_size;
+    } else {
+        max_heap_size = uv_get_free_memory() * 70 / 100;
+    }
+
+    // if only max size is specified initialize MMTk with a fixed size heap
+    if ((max_size_def != NULL || max_size_gb != NULL) && (min_size_def == NULL && min_size_gb == NULL)) {
+        gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)));
+    } else {
+        gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)));
+    }
+#endif
+
     jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL};
     gc_mark_loop(NULL, sp);
     t_start = jl_hrtime();
@@ -3664,6 +3793,9 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
                                 jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
         jl_atomic_store_relaxed(&ptls->gc_num.malloc,
                                 jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
+#ifdef MMTKHEAP
+        return mmtk_counted_malloc(sz);
+#endif
     }
     return malloc(sz);
 }
@@ -3679,6 +3811,9 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
                                 jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
         jl_atomic_store_relaxed(&ptls->gc_num.malloc,
                                 jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
+#ifdef MMTKHEAP
+        return mmtk_counted_calloc(nm, sz);
+#endif
     }
     return calloc(nm, sz);
 }
@@ -3687,14 +3822,18 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 {
     jl_gcframe_t **pgcstack = jl_get_pgcstack();
     jl_task_t *ct = jl_current_task;
-    free(p);
     if (pgcstack && ct->world_age) {
         jl_ptls_t ptls = ct->ptls;
         jl_atomic_store_relaxed(&ptls->gc_num.freed,
                                 jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz);
        jl_atomic_store_relaxed(&ptls->gc_num.freecall,
                                 jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1);
+#ifdef MMTKHEAP
+        mmtk_free_with_size(p, sz);
+        return;
+#endif
     }
+    free(p);
 }
 
@@ -3712,6 +3851,9 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
                                 jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
         jl_atomic_store_relaxed(&ptls->gc_num.realloc,
                                 jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
+#ifdef MMTKHEAP
+        return mmtk_realloc_with_old_size(p, sz, old);
+#endif
     }
     return realloc(p, sz);
 }
@@ -3853,6 +3995,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
 
 jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz)
 {
+#ifndef MMTKHEAP
     size_t len = jl_string_len(s);
     if (sz <= len) return s;
     jl_taggedvalue_t *v = jl_astaggedvalue(s);
@@ -3886,6 +4029,12 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz)
     jl_value_t *snew = jl_valueof(&newbig->header);
     *(size_t*)snew = sz;
     return snew;
+#else
+    size_t len = jl_string_len(s);
+    jl_value_t *snew = jl_alloc_string(sz);
+    memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? sz : len);
+    return snew;
+#endif
 }
 
 // Perm gen allocator

diff --git a/src/init.c b/src/init.c
index 0651d3b274f24..45d6b8ee98873 100644
--- a/src/init.c
+++ b/src/init.c
@@ -295,8 +295,12 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER
     JL_STDOUT = (uv_stream_t*) STDOUT_FILENO;
     JL_STDERR = (uv_stream_t*) STDERR_FILENO;
 
+#ifndef MMTKHEAP
     if (ct)
         jl_gc_run_all_finalizers(ct);
+#else
+    mmtk_jl_gc_run_all_finalizers();
+#endif
 
     uv_loop_t *loop = jl_global_event_loop();
     if (loop != NULL) {
@@ -806,6 +810,12 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel)
     arraylist_new(&jl_image_relocs, 0);
 
     jl_ptls_t ptls = jl_init_threadtls(0);
+
+#ifdef MMTKHEAP
+    // start MMTk's GC
+    initialize_collection((void*) ptls);
+#endif
+
 #pragma GCC diagnostic push
 #if defined(_COMPILER_GCC_) && __GNUC__ >= 12
 #pragma GCC diagnostic ignored "-Wdangling-pointer"

diff --git a/src/julia.h b/src/julia.h
index 03efa773d026c..2bc1a97b681ed 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -932,22 +932,27 @@ JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *root, const jl_value_t
 
 STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT
 {
+#ifndef MMTKHEAP
     // parent and ptr isa jl_value_t*
     if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 && // parent is old and not in remset
                    (jl_astaggedvalue(ptr)->bits.gc & 1) == 0)) // ptr is young
         jl_gc_queue_root((jl_value_t*)parent);
+#endif
 }
 
 STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t*
 {
+#ifndef MMTKHEAP
     // if ptr is old
     if (__unlikely(jl_astaggedvalue(ptr)->bits.gc == 3)) {
         jl_gc_queue_root((jl_value_t*)ptr);
     }
+#endif
 }
 
 STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT
 {
+#ifndef MMTKHEAP
     // ptr is an immutable object
     if (__likely(jl_astaggedvalue(parent)->bits.gc != 3))
         return; // parent is young or in remset
@@ -957,6 +962,7 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
     const jl_datatype_layout_t *ly = dt->layout;
     if (ly->npointers)
         jl_gc_queue_multiroot((jl_value_t*)parent, ptr);
+#endif
 }
 
 JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz);

diff --git a/src/julia_internal.h b/src/julia_internal.h
index 7565967b0a270..adf0c0c3fdd67 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -329,6 +329,10 @@ JL_DLLEXPORT extern const char *jl_filename;
 jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize);
 jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz);
+#ifdef MMTKHEAP
+JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty);
+JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz);
+#endif
 JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT;
 extern uv_mutex_t gc_perm_lock;
 void *jl_gc_perm_alloc_nolock(size_t sz, int zero,
@@ -452,17 +456,27 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty)
     jl_value_t *v;
     const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
     if (sz <= GC_MAX_SZCLASS) {
+#ifndef MMTKHEAP
         int pool_id = jl_gc_szclass(allocsz);
         jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
         int osize = jl_gc_sizeclasses[pool_id];
         // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in
         // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.)
         v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize);
+#else
+        int pool_id = jl_gc_szclass(allocsz);
+        int osize = jl_gc_sizeclasses[pool_id];
+        v = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, ty);
+#endif
     }
     else {
         if (allocsz < sz) // overflow in adding offs, size was "negative"
             jl_throw(jl_memory_exception);
+#ifndef MMTKHEAP
        v = jl_gc_big_alloc_noinline(ptls, allocsz);
+#else
+        v = jl_mmtk_gc_alloc_big(ptls, allocsz);
+#endif
     }
     jl_set_typeof(v, ty);
     maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty);
@@ -564,16 +578,20 @@ void gc_setmark_buf(jl_ptls_t ptls, void *buf, uint8_t, size_t) JL_NOTSAFEPOINT;
 
 STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t*
 {
+#ifndef MMTKHEAP
     jl_gc_wb(bnd, val);
+#endif
 }
 
 STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t*
 {
+#ifndef MMTKHEAP
     // if parent is marked and buf is not
     if (__unlikely(jl_astaggedvalue(parent)->bits.gc & 1)) {
         jl_task_t *ct = jl_current_task;
         gc_setmark_buf(ct->ptls, bufptr, 3, minsz);
     }
+#endif
 }
 
 void jl_gc_debug_print_status(void);

diff --git a/src/julia_threads.h b/src/julia_threads.h
index 5874225c12eac..4d6284562120b 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -4,6 +4,10 @@
 #ifndef JL_THREADS_H
 #define JL_THREADS_H
 
+#ifdef MMTKHEAP
+#include "mmtkMutator.h"
+#endif
+
 #include "julia_atomics.h"
 #ifndef _OS_WINDOWS_
 #include "pthread.h"
@@ -282,6 +286,12 @@ typedef struct _jl_tls_states_t {
         uint64_t sleep_leave;
     )
 
+#ifdef MMTKHEAP
+    MMTkMutatorContext* mmtk_mutator_ptr;
+    void* cursor;
+    void* limit;
+#endif
+
     // some hidden state (usually just because we don't have the type's size declaration)
 #ifdef LIBRARY_EXPORTS
     uv_mutex_t sleep_lock;

diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp
index 3b8533c6d0115..3e2eb3bcdf6ed 100644
--- a/src/llvm-final-gc-lowering.cpp
+++ b/src/llvm-final-gc-lowering.cpp
@@ -224,10 +224,78 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
         derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sz + sizeof(void*));
     }
     else {
+#ifndef MMTKHEAP
         auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset);
         auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
         newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize });
         derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize);
+#else
+        auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+        auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
+        auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor));
+        auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit));
+
+        auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+        auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+        auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+        auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+        auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+        auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+        auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+        auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+        auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+        auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+        auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+        auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+        auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+        auto current_block = target->getParent();
+        builder.SetInsertPoint(target->getNextNode());
+        auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow");
+        auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont");
+
+        auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+        auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont);
+
+        auto next_br = current_block->getTerminator();
+        next_br->eraseFromParent();
+        builder.SetInsertPoint(current_block);
+        builder.CreateCondBr(gt_limit, slowpath, fastpath);
+
+        // slowpath
+        builder.SetInsertPoint(slowpath);
+        auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+        auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32 });
+        new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+        builder.CreateBr(top_cont);
+
+        // fastpath
+        builder.SetInsertPoint(fastpath);
+        builder.CreateStore(new_cursor, cursor_ptr);
+
+        // ptls->gc_num.allocd += osize;
+        auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num));
+        auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+        auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+        auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+        auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+        builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+        auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+        auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());
+        builder.CreateBr(top_cont);
+
+        phiNode->addIncoming(new_call, slowpath);
+        phiNode->addIncoming(v_as_ptr, fastpath);
+        phiNode->takeName(target);
+
+        return phiNode;
+#endif
     }
     newI->setAttributes(newI->getCalledFunction()->getAttributes());
     newI->addRetAttr(derefAttr);

diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp
index ea390f01010fd..c46228f13490b 100644
--- a/src/llvm-pass-helpers.cpp
+++ b/src/llvm-pass-helpers.cpp
@@ -226,8 +226,13 @@ namespace jl_intrinsics {
 }
 
 namespace jl_well_known {
+#ifndef MMTKHEAP
     static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc);
     static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc);
+#else
+    static const char *GC_BIG_ALLOC_NAME = XSTR(jl_mmtk_gc_alloc_big);
+    static const char *GC_POOL_ALLOC_NAME = XSTR(jl_mmtk_gc_alloc_default_llvm);
+#endif
     static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root);
 
     using jl_intrinsics::addGCAllocAttributes;
diff --git a/src/threading.c b/src/threading.c
index db9df0bad0dde..52b3fc2d8c06d 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -347,6 +347,10 @@ jl_ptls_t jl_init_threadtls(int16_t tid)
     ptls->rngseed = jl_rand();
     if (tid == 0)
         ptls->disable_gc = 1;
+#ifdef MMTKHEAP
+    if (tid == 0)
+        disable_collection();
+#endif
 #ifdef _OS_WINDOWS_
     if (tid == 0) {
         if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(),

From 72a275233012a80dfd5c5ac1c83afdf9aff0a87a Mon Sep 17 00:00:00 2001
From: Kiran
Date: Thu, 16 Mar 2023 19:14:52 -0400
Subject: [PATCH 002/116] Refactor MMTk changes to Julia (#5)

In preparation for upstreaming these changes. This ended up being a
pretty large set of changes, but I think most of this would have been
necessary for the upstream PR anyway.

Summary of the changes:
- Build-related, to more closely match how Julia adds optional
  libraries. More has to be done here, for automation and for
  BinaryBuilder.
- Reduced the `#ifdef MMTK_GC`s.
- Documented the GC interface in `gc-interface.h`.
- Moved code that is common to Julia's GC and MMTk into `gc-common.c`.
- Exclude `gc-debug.c` and `gc-pages.c` entirely for MMTk.
- Reorganized `gc.h` into 3 parts: common, MMTk-specific, and Julia
  GC-specific.
- Reorganized changes to `julia.h` and `julia_internal.h`.
- Removed `#ifdef MMTK_GC` in `llvm-pass-helpers.cpp`.

We can now start working on resolving the TODOs.
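For orientation while reviewing: the entry points documented in
gc-interface.h are the ones the rest of the runtime already calls; an
illustrative excerpt (signatures taken from elsewhere in this series, not
the full header):

    jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty);   // object allocation
    void jl_gc_collect(jl_gc_collection_t collection);              // collection request
    jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value); // weak refs
    void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f); // finalizers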
---
 Make.inc                       |  26 ++
 contrib/refresh_checksums.mk   |   2 +-
 src/Makefile                   |  77 +--
 src/array.c                    |   4 +-
 src/gc-common.c                | 732 +++++++++++++++++++++++++++++
 src/gc-debug.c                 |  47 +-
 src/gc-pages.c                 |   4 +
 src/gc.c                       | 832 +--------------------------------
 src/gc.h                       | 243 ++++++----
 src/init.c                     |  13 +-
 src/julia.h                    |  22 +-
 src/julia_internal.h           |  49 +-
 src/julia_threads.h            |   4 +-
 src/llvm-final-gc-lowering.cpp |   6 +-
 src/llvm-pass-helpers.cpp      |   5 -
 src/mmtk-gc.c                  | 487 +++++++++++++++++++
 src/threading.c                |   6 +-
 17 files changed, 1515 insertions(+), 1044 deletions(-)
 create mode 100644 src/gc-common.c
 create mode 100644 src/mmtk-gc.c

diff --git a/Make.inc b/Make.inc
index bb1922c32bc44..7c1ca6a5db7a8 100644
--- a/Make.inc
+++ b/Make.inc
@@ -86,6 +86,9 @@ HAVE_SSP := 0
 WITH_GC_VERIFY := 0
 WITH_GC_DEBUG_ENV := 0
 
+# MMTk GC
+WITH_MMTK ?= 0
+
 # Enable DTrace support
 WITH_DTRACE := 0
 
@@ -709,6 +712,29 @@ JCXXFLAGS += -DGC_DEBUG_ENV
 JCFLAGS += -DGC_DEBUG_ENV
 endif
 
+ifeq ($(WITH_MMTK), 1)
+ifeq (${MMTK_JULIA_DIR},)
+$(error MMTK_JULIA_DIR must be set to use MMTk)
+endif
+JCXXFLAGS += -DMMTK_GC
+JCFLAGS += -DMMTK_GC
+ifeq (${MMTK_BUILD},)
+ifeq (debug,$(findstring debug,$(MAKECMDGOALS)))
+MMTK_BUILD = debug
+else
+MMTK_BUILD = release
+endif
+endif
+MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk
+MMTK_API_INC = $(MMTK_DIR)/api
+MMTK_JULIA_INC = ${MMTK_JULIA_DIR}/julia
+MMTK_LIB := -L$(MMTK_DIR)/target/$(MMTK_BUILD) -lmmtk_julia
+LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/
+else
+MMTK_JULIA_INC :=
+MMTK_LIB :=
+endif
+
 ifeq ($(WITH_DTRACE), 1)
 JCXXFLAGS += -DUSE_DTRACE
 JCFLAGS += -DUSE_DTRACE

diff --git a/contrib/refresh_checksums.mk b/contrib/refresh_checksums.mk
index fc632728e9a9e..664a1e4b038e0 100644
--- a/contrib/refresh_checksums.mk
+++ b/contrib/refresh_checksums.mk
@@ -24,7 +24,7 @@ CLANG_TRIPLETS=$(filter %-darwin %-freebsd,$(TRIPLETS))
 NON_CLANG_TRIPLETS=$(filter-out %-darwin %-freebsd,$(TRIPLETS))
 
 # These are the projects currently using BinaryBuilder; both GCC-expanded and non-GCC-expanded:
-BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline
+BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline libmmtk_julia
 BB_GCC_EXPANDED_PROJECTS=openblas csl
 BB_CXX_EXPANDED_PROJECTS=gmp llvm clang llvm-tools lld
 # These are non-BB source-only deps

diff --git a/src/Makefile b/src/Makefile
index d113eea5422a5..2e976282015d6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,17 +4,6 @@ BUILDDIR := .
 include $(JULIAHOME)/Make.inc
 include $(JULIAHOME)/deps/llvm-ver.make
 
-ifeq ($(USE_MMTK), 1)
-CFLAGS = -DMMTKHEAP
-CPPFLAGS = -DMMTKHEAP
-MMTK_BUILD_TYPE = ${MMTK_BUILD}
-MMTK_DIR = ${MMTK_JULIA_DIR}
-MMTK_API_DIR_INCLUDE = $(MMTK_DIR)/api
-MMTK_JULIA_DIR_INCLUDE = $(MMTK_DIR)/../julia
-MMTK_LIB := -L$(MMTK_DIR)/target/$(MMTK_BUILD_TYPE)/ -lmmtk_julia
-LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD_TYPE)/
-endif
-
 JCFLAGS += $(CFLAGS)
 JCXXFLAGS += $(CXXFLAGS)
 JCPPFLAGS += $(CPPFLAGS)
@@ -30,10 +19,6 @@ FLAGS := \
 	-I$(LIBUV_INC) -I$(build_includedir) \
 	-I$(JULIAHOME)/deps/valgrind
 
-ifeq ($(USE_MMTK), 1)
-FLAGS += -I$(MMTK_API_DIR_INCLUDE) -I$(MMTK_JULIA_DIR_INCLUDE)
-endif
-
 FLAGS += -Wall -Wno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden -fno-common \
 	-Wno-comment -Wpointer-arith -Wundef
 ifeq ($(USEGCC),1) # GCC bug #25509 (void)__attribute__((warn_unused_result))
@@ -45,6 +30,10 @@ ifeq ($(USECLANG),1)
 FLAGS += -Wno-return-type-c-linkage
 endif
 
+ifeq ($(WITH_MMTK), 1)
+FLAGS += -I$(MMTK_API_INC) -I$(MMTK_JULIA_INC)
+endif
+
 FLAGS += -DJL_BUILD_ARCH='"$(ARCH)"'
 ifeq ($(OS),WINNT)
 FLAGS += -DJL_BUILD_UNAME='"NT"'
@@ -60,9 +49,10 @@ SRCS := \
 	jltypes gf typemap smallintset ast builtins module interpreter symbol \
 	dlload sys init task array staticdata toplevel jl_uv datatype \
 	simplevector runtime_intrinsics precompile jloptions \
-	threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
-	jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \
-	crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall
+	threading partr stackwalk gc-common gc gc-debug gc-pages gc-stacks gc-alloc-profiler \
+	mmtk-gc method jlapi signal-handling safepoint timing subtype rtutils \
+	gc-heap-snapshot crc32c APInt-C processor ircode opaque_closure codegen-stubs \
+	coverage runtime_ccall
 
 RT_LLVMLINK :=
 CG_LLVMLINK :=
@@ -173,13 +163,8 @@ LIBJULIA_PATH_REL := libjulia
 endif
 
 COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir)
-RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS)
-CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS)
-
-ifeq ($(USE_MMTK), 1)
-CG_LIBS += $(MMTK_LIB)
-RT_LIBS += $(MMTK_LIB)
-endif
-
+RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(MMTK_LIB)
+CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(MMTK_LIB)
 RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS)
 CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug
@@ -189,10 +174,13 @@ CG_RELEASE_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia -ljulia-internal
 OBJS := $(SRCS:%=$(BUILDDIR)/%.o)
 DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj)
 
-ifeq ($(USE_MMTK), 1)
+ifeq ($(WITH_MMTK), 1)
 MMTK_SRCS := mmtk_julia
-MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_DIR_INCLUDE)/%.o)
-MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_DIR_INCLUDE)/%.o)
+MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o)
+MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj)
+else
+MMTK_OBJS :=
+MMTK_DOBJS :=
 endif
 
 CODEGEN_OBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.o)
 CODEGEN_DOBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.dbg.obj)
@@ -262,10 +250,10 @@ $(BUILDDIR)/%.o : $(SRCDIR)/%.d
 $(BUILDDIR)/%.dbg.obj : $(SRCDIR)/%.d
 	@$(call PRINT_DTRACE, $(DTRACE) -G -s $< -o $@)
 
-ifeq ($(USE_MMTK), 1)
-$(MMTK_JULIA_DIR_INCLUDE)/%.o: $(MMTK_JULIA_DIR_INCLUDE)/%.c $(HEADERS) | $(MMTK_JULIA_DIR_INCLUDE)
+ifeq ($(WITH_MMTK), 1)
+$(MMTK_JULIA_INC)/%.o: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC)
 	@$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@)
-$(MMTK_JULIA_DIR_INCLUDE)/%.dbg.obj: $(MMTK_JULIA_DIR_INCLUDE)/%.c $(HEADERS) | $(MMTK_JULIA_DIR_INCLUDE)
+$(MMTK_JULIA_INC)/%.dbg.obj: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC)
 	@$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@)
 endif
@@ -333,6 +321,8 @@ $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h
 $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
 $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
+$(BUILDDIR)/mmtk-gc.o $(BUILDDIR)/mmtk-gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
+$(BUILDDIR)/gc-common.o $(BUILDDIR)/gc-common.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h
 $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h
 $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h
@@ -398,7 +388,6 @@ $(BUILDDIR)/julia_version.h: $(JULIAHOME)/VERSION
 
 CXXLD = $(CXX) -shared
 
-ifeq ($(USE_MMTK), 1)
 $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(OBJS) $(MMTK_OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV)
 	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(MMTK_OBJS) $(RPATH_LIB) -o $@ \
 		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT)))
@@ -410,19 +399,6 @@ $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)
 		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT)))
 	@$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@
 	$(DSYMUTIL) $@
-else
-$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV)
-	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \
-		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT)))
-	@$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@
-	$(DSYMUTIL) $@
-
-$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(SRCDIR)/julia.expmap $(DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV)
-	@$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(RPATH_LIB) -o $@ \
-		$(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT)))
-	@$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@
-	$(DSYMUTIL) $@
-endif
 
 ifneq ($(OS), WINNT)
 $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_SHLIB_EXT) $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT): $(build_shlibdir)/libjulia-internal%.$(JL_MAJOR_SHLIB_EXT): \
@@ -464,20 +440,11 @@ libjulia-codegen-release: $(build_shlibdir)/libjulia-codegen.$(JL_MAJOR_MINOR_SH
 libjulia-codegen-debug: $(build_shlibdir)/libjulia-codegen-debug.$(JL_MAJOR_MINOR_SHLIB_EXT)
 libjulia-codegen-debug libjulia-codegen-release: $(PUBLIC_HEADER_TARGETS)
 
-ifeq ($(USE_MMTK), 1)
 clean:
 	-rm -fr $(build_shlibdir)/libjulia-internal* $(build_shlibdir)/libccalltest* $(build_shlibdir)/libllvmcalltest*
 	-rm -f $(BUILDDIR)/julia_flisp.boot $(BUILDDIR)/julia_flisp.boot.inc
 	-rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a
-	-rm -f $(BUILDDIR)/julia_version.h
-	-rm -fr $(MMTK_JULIA_DIR_INCLUDE)/*.o
-else
-clean:
-	-rm -fr $(build_shlibdir)/libjulia-internal* $(build_shlibdir)/libjulia-codegen* $(build_shlibdir)/libccalltest* $(build_shlibdir)/libllvmcalltest*
-	-rm -f $(BUILDDIR)/julia_flisp.boot $(BUILDDIR)/julia_flisp.boot.inc $(BUILDDIR)/jl_internal_funcs.inc
-	-rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a $(BUILDDIR)/*.h.gen
-	-rm -f $(BUILDDIR)/julia_version.h
-endif
+	-rm -f $(BUILDDIR)/julia_version.h $(MMTK_OBJS) $(MMTK_DOBJS)
 
 clean-flisp:
 	-$(MAKE) -C $(SRCDIR)/flisp clean BUILDDIR='$(abspath $(BUILDDIR)/flisp)'

diff --git a/src/array.c b/src/array.c
index f515f5d26c024..c6cefbebceb20 100644
--- a/src/array.c
+++ b/src/array.c
@@ -497,7 +497,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len)
     jl_ptls_t ptls = ct->ptls;
     const size_t allocsz = sz + sizeof(jl_taggedvalue_t);
     if (sz <= GC_MAX_SZCLASS) {
-#ifndef MMTKHEAP
+#ifndef MMTK_GC
         int pool_id = jl_gc_szclass_align8(allocsz);
         jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id];
         int osize = jl_gc_sizeclasses[pool_id];
@@ -513,7 +513,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len)
     else {
         if (allocsz < sz) // overflow in adding offs, size was "negative"
             jl_throw(jl_memory_exception);
-#ifndef MMTKHEAP
+#ifndef MMTK_GC
         s = jl_gc_big_alloc_noinline(ptls, allocsz);
 #else
         s = jl_mmtk_gc_alloc_big(ptls, allocsz);

diff --git a/src/gc-common.c b/src/gc-common.c
new file mode 100644
index 0000000000000..f5636c97fe32a
--- /dev/null
+++ b/src/gc-common.c
@@ -0,0 +1,732 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "gc.h"
+
+jl_gc_num_t gc_num = {0};
+size_t last_long_collect_interval;
+int gc_n_threads;
+jl_ptls_t* gc_all_tls_states;
+
+int64_t live_bytes = 0;
+
+JL_DLLEXPORT _Atomic(int) jl_gc_have_pending_finalizers = 0;
+
+// mutex for gc-heap-snapshot.
+jl_mutex_t heapsnapshot_lock;
+
+const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00
+JL_DLLEXPORT uintptr_t jl_get_buff_tag(void)
+{
+    return jl_buff_tag;
+}
+
+// GC knobs and self-measurement variables
+
+int64_t last_gc_total_bytes = 0;
+
+// max_total_memory is a suggestion. We try very hard to stay
+// under this limit, but we will go above it rather than halting.
+#ifdef _P64
+typedef uint64_t memsize_t;
+const size_t default_collect_interval = 5600 * 1024 * sizeof(void*);
+const size_t max_collect_interval = 1250000000UL;
+size_t total_mem;
+// We expose this to the user/ci as jl_gc_set_max_memory
+memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024;
+#else
+typedef uint32_t memsize_t;
+const size_t default_collect_interval = 3200 * 1024 * sizeof(void*);
+const size_t max_collect_interval = 500000000UL;
+// Work really hard to stay within 2GB
+// Alternative is to risk running out of address space
+// on 32 bit architectures.
+memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024;
+#endif
+
+// finalizers
+// ---
+uint64_t finalizer_rngState[4];
+
+JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void)
+{
+    jl_rng_split(finalizer_rngState, jl_current_task->rngState);
+}
+
+void run_finalizer(jl_task_t *ct, void *o, void *ff)
+{
+    int ptr_finalizer = gc_ptr_tag(o, 1);
+    o = gc_ptr_clear_tag(o, 3);
+    if (ptr_finalizer) {
+        ((void (*)(void*))ff)((void*)o);
+        return;
+    }
+    JL_TRY {
+        size_t last_age = ct->world_age;
+        ct->world_age = jl_atomic_load_acquire(&jl_world_counter);
+        jl_apply_generic((jl_value_t*)ff, (jl_value_t**)&o, 1);
+        ct->world_age = last_age;
+    }
+    JL_CATCH {
+        jl_printf((JL_STREAM*)STDERR_FILENO, "error in running finalizer: ");
+        jl_static_show((JL_STREAM*)STDERR_FILENO, jl_current_exception());
+        jl_printf((JL_STREAM*)STDERR_FILENO, "\n");
+        jlbacktrace(); // written to STDERR_FILENO
+    }
+}
+
+JL_DLLEXPORT int jl_gc_get_finalizers_inhibited(jl_ptls_t ptls)
+{
+    if (ptls == NULL)
+        ptls = jl_current_task->ptls;
+    return ptls->finalizers_inhibited;
+}
+
+JL_DLLEXPORT void jl_gc_disable_finalizers_internal(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    ptls->finalizers_inhibited++;
+}
+
+JL_DLLEXPORT void jl_gc_enable_finalizers_internal(void)
+{
+    jl_task_t *ct = jl_current_task;
+#ifdef NDEBUG
+    ct->ptls->finalizers_inhibited--;
+#else
+    jl_gc_enable_finalizers(ct, 1);
+#endif
+}
+
+JL_DLLEXPORT void jl_gc_enable_finalizers(jl_task_t *ct, int on)
+{
+    if (ct == NULL)
+        ct = jl_current_task;
+    jl_ptls_t ptls = ct->ptls;
+    int old_val = ptls->finalizers_inhibited;
+    int new_val = old_val + (on ? -1 : 1);
+    if (new_val < 0) {
+        JL_TRY {
+            jl_error(""); // get a backtrace
+        }
+        JL_CATCH {
+            jl_printf((JL_STREAM*)STDERR_FILENO, "WARNING: GC finalizers already enabled on this thread.\n");
+            // Only print the backtrace once, to avoid spamming the logs
+            static int backtrace_printed = 0;
+            if (backtrace_printed == 0) {
+                backtrace_printed = 1;
+                jlbacktrace(); // written to STDERR_FILENO
+            }
+        }
+        return;
+    }
+    ptls->finalizers_inhibited = new_val;
+    if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers)) {
+        jl_gc_run_pending_finalizers(ct);
+    }
+}
+
+// allocation
+// ---
+
+JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty)
+{
+    return jl_gc_alloc_(ptls, sz, ty);
+}
+
+// Instrumented version of jl_gc_big_alloc_inner, called into by
+// LLVM-generated code.
+JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
+{
+    jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz);
+    maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag);
+    return val;
+}
+
+// This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being
+// inlined into its callers. We provide an external-facing interface for
+// callers, and inline `jl_gc_big_alloc_inner` into this. (See
+// https://github.com/JuliaLang/julia/pull/43868 for more details.)
+jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz)
+{
+    return jl_gc_big_alloc_inner(ptls, sz);
+}
+
+// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code.
+JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, int osize)
+{
+    jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
+    maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag);
+    return val;
+}
+
+// This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into
+// its callers. We provide an external-facing interface for callers, and inline
+// `jl_gc_pool_alloc_inner` into this. (See https://github.com/JuliaLang/julia/pull/43868
+// for more details.)
+jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize)
+{
+    return jl_gc_pool_alloc_inner(ptls, pool_offset, osize);
+}
+
+int jl_gc_classify_pools(size_t sz, int *osize)
+{
+    if (sz > GC_MAX_SZCLASS)
+        return -1;
+    size_t allocsz = sz + sizeof(jl_taggedvalue_t);
+    int klass = jl_gc_szclass(allocsz);
+    *osize = jl_gc_sizeclasses[klass];
+    return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]);
+}
+
+// TODO: jl_gc_track_malloced_array needed? Eliminate heap.mallocarrays,
+// heap.mafreelist, mallocarray_t?
+void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT
+{
+    // This is **NOT** a GC safe point.
+    mallocarray_t *ma;
+    if (ptls->heap.mafreelist == NULL) {
+        ma = (mallocarray_t*)malloc_s(sizeof(mallocarray_t));
+    }
+    else {
+        ma = ptls->heap.mafreelist;
+        ptls->heap.mafreelist = ma->next;
+    }
+    ma->a = a;
+    ma->next = ptls->heap.mallocarrays;
+    ptls->heap.mallocarrays = ma;
+}
+
+void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    jl_atomic_store_relaxed(&ptls->gc_num.allocd,
+                            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
+}
+
+// GCNum, statistics manipulation
+// ---
+void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
+{
+    int gc_n_threads;
+    jl_ptls_t* gc_all_tls_states;
+    gc_n_threads = jl_atomic_load_acquire(&jl_n_threads);
+    gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls = gc_all_tls_states[i];
+        if (ptls) {
+            dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval);
+            dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
+            dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
+            dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
+            dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
+            dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
+            dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+        }
+    }
+}
+
+void reset_thread_gc_counts(void) JL_NOTSAFEPOINT
+{
+    int gc_n_threads;
+    jl_ptls_t* gc_all_tls_states;
+    gc_n_threads = jl_atomic_load_acquire(&jl_n_threads);
+    gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
+    for (int i = 0; i < gc_n_threads; i++) {
+        jl_ptls_t ptls = gc_all_tls_states[i];
+        if (ptls) {
+            memset(&ptls->gc_num, 0, sizeof(ptls->gc_num));
+            jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval);
+        }
+    }
+}
+
+void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT
+{
+    combine_thread_gc_counts(&gc_num);
+    live_bytes += (gc_num.deferred_alloc + gc_num.allocd);
+    gc_num.allocd = 0;
+    gc_num.deferred_alloc = 0;
+    reset_thread_gc_counts();
+}
+
+size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT
+{
+    size_t sz = 0;
+    int isbitsunion = jl_array_isbitsunion(a);
+    if (jl_array_ndims(a) == 1)
+        sz = a->elsize * a->maxsize + ((a->elsize == 1 && !isbitsunion) ? 1 : 0);
+    else
+        sz = a->elsize * jl_array_len(a);
+    if (isbitsunion)
+        // account for isbits Union array selector bytes
+        sz += jl_array_len(a);
+    return sz;
+}
+
+void gc_premark(jl_ptls_t ptls2)
+{
+    arraylist_t *remset = ptls2->heap.remset;
+    ptls2->heap.remset = ptls2->heap.last_remset;
+    ptls2->heap.last_remset = remset;
+    ptls2->heap.remset->len = 0;
+    ptls2->heap.remset_nptr = 0;
+    // avoid counting remembered objects
+    // in `perm_scanned_bytes`
+    size_t len = remset->len;
+    void **items = remset->items;
+    for (size_t i = 0; i < len; i++) {
+        jl_value_t *item = (jl_value_t *)items[i];
+        objprofile_count(jl_typeof(item), 2, 0);
+        jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED;
+    }
+}
+
+// GC control
+// ---
+
+_Atomic(uint32_t) jl_gc_disable_counter = 1;
+
+JL_DLLEXPORT int jl_gc_enable(int on)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    int prev = !ptls->disable_gc;
+    ptls->disable_gc = (on == 0);
+    if (on && !prev) {
+        // disable -> enable
+        if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) {
+            gc_num.allocd += gc_num.deferred_alloc;
+            gc_num.deferred_alloc = 0;
+            enable_collection();
+        }
+    }
+    else if (prev && !on) {
+        disable_collection();
+        // enable -> disable
+        jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
+        // check if the GC is running and wait for it to finish
+        jl_gc_safepoint_(ptls);
+    }
+    return prev;
+}
+
+JL_DLLEXPORT int jl_gc_is_enabled(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return !ptls->disable_gc;
+}
+
+JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT
+{
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
+    // Sync this logic with `base/util.jl:GC_Diff`
+    *bytes = (num.total_allocd + num.deferred_alloc + num.allocd);
+}
+
+JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
+{
+    return gc_num.total_time;
+}
+
+JL_DLLEXPORT jl_gc_num_t jl_gc_num(void)
+{
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
+    return num;
+}
+
+JL_DLLEXPORT void jl_gc_reset_stats(void)
+{
+    gc_num.max_pause = 0;
+    gc_num.max_memory = 0;
+    gc_num.max_time_to_safepoint = 0;
+}
+
+// TODO: these were supposed to be thread local
+JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT
+{
+    int64_t oldtb = last_gc_total_bytes;
+    int64_t newtb;
+    jl_gc_get_total_bytes(&newtb);
+    last_gc_total_bytes = newtb;
+    return newtb - oldtb;
+}
+
+JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT
+{
+    int64_t oldtb = last_gc_total_bytes;
+    int64_t newtb;
+    jl_gc_get_total_bytes(&newtb);
+    last_gc_total_bytes = newtb - offset;
+    return newtb - oldtb;
+}
+
+JL_DLLEXPORT int64_t jl_gc_live_bytes(void)
+{
+    return live_bytes;
+}
+
+JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem)
+{
+    if (max_mem > 0 && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1))
+        max_total_memory = max_mem;
+}
+
+// callback for passing OOM errors from gmp
+JL_DLLEXPORT void jl_throw_out_of_memory_error(void)
+{
+    jl_throw(jl_memory_exception);
+}
+
+// allocation wrappers that save the size of allocations, to allow using
+// jl_gc_counted_* functions with a libc-compatible API.
+
+JL_DLLEXPORT void *jl_malloc(size_t sz)
+{
+    int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT);
+    if (p == NULL)
+        return NULL;
+    p[0] = sz;
+    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+}
+
+//_unchecked_calloc does not check for potential overflow of nm*sz
+STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) {
+    size_t nmsz = nm*sz;
+    int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1);
+    if (p == NULL)
+        return NULL;
+    p[0] = nmsz;
+    return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+}
+
+JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
+{
+    if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT)
+        return NULL;
+    return _unchecked_calloc(nm, sz);
+}
+
+JL_DLLEXPORT void jl_free(void *p)
+{
+    if (p != NULL) {
+        int64_t *pp = (int64_t *)p - 2;
+        size_t sz = pp[0];
+        jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT);
+    }
+}
+
+JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
+{
+    int64_t *pp;
+    size_t szold;
+    if (p == NULL) {
+        pp = NULL;
+        szold = 0;
+    }
+    else {
+        pp = (int64_t *)p - 2;
+        szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT;
+    }
+    int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT);
+    if (pnew == NULL)
+        return NULL;
+    pnew[0] = sz;
+    return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
+}
+
+// allocating blocks for Arrays and Strings
+
+JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    maybe_collect(ptls);
+    size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
+    if (allocsz < sz) // overflow in adding offs, size was "negative"
+        jl_throw(jl_memory_exception);
+    jl_atomic_store_relaxed(&ptls->gc_num.allocd,
+                            jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
+    jl_atomic_store_relaxed(&ptls->gc_num.malloc,
+                            jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
+    int last_errno = errno;
+#ifdef _OS_WINDOWS_
+    DWORD last_error = GetLastError();
+#endif
+    void *b = malloc_cache_align(allocsz);
+    if (b == NULL)
+        jl_throw(jl_memory_exception);
+#ifdef _OS_WINDOWS_
+    SetLastError(last_error);
+#endif
+    errno = last_errno;
+    // jl_gc_managed_malloc is currently always used for allocating array buffers.
+    maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag);
+    return b;
+}
+
+void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz,
+                          int isaligned, jl_value_t *owner, int8_t can_collect)
+{
+    if (can_collect)
+        maybe_collect(ptls);
+
+    size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
+    if (allocsz < sz) // overflow in adding offs, size was "negative"
+        jl_throw(jl_memory_exception);
+
+    // TODO: not needed? gc_cache.*?
+    if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) {
+        ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
+        live_bytes += allocsz - oldsz;
+    }
+    else if (allocsz < oldsz)
+        jl_atomic_store_relaxed(&ptls->gc_num.freed,
+                                jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz));
+    else
+        jl_atomic_store_relaxed(&ptls->gc_num.allocd,
+                                jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
+    jl_atomic_store_relaxed(&ptls->gc_num.realloc,
+                            jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
+
+    int last_errno = errno;
+#ifdef _OS_WINDOWS_
+    DWORD last_error = GetLastError();
+#endif
+    void *b;
+    if (isaligned)
+        b = realloc_cache_align(d, allocsz, oldsz);
+    else
+        b = realloc(d, allocsz);
+    if (b == NULL)
+        jl_throw(jl_memory_exception);
+#ifdef _OS_WINDOWS_
+    SetLastError(last_error);
+#endif
+    errno = last_errno;
+    maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag);
+    return b;
+}
+
+JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
+                                         int isaligned, jl_value_t *owner)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return gc_managed_realloc_(ptls, d, sz, oldsz, isaligned, owner, 1);
+}
+
+// Perm gen allocator
+// 2M pool
+#define GC_PERM_POOL_SIZE (2 * 1024 * 1024)
+// 20k limit for pool allocation. At most 1% fragmentation
+#define GC_PERM_POOL_LIMIT (20 * 1024)
+uv_mutex_t gc_perm_lock;
+static uintptr_t gc_perm_pool = 0;
+static uintptr_t gc_perm_end = 0;
+
+static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT
+{
+    // `align` must be power of two
+    assert(offset == 0 || offset < align);
+    const size_t malloc_align = sizeof(void*) == 8 ? 16 : 4;
+    if (align > 1 && (offset != 0 || align > malloc_align))
+        sz += align - 1;
+    int last_errno = errno;
+#ifdef _OS_WINDOWS_
+    DWORD last_error = GetLastError();
+#endif
+    void *base = zero ? calloc(1, sz) : malloc(sz);
+    if (base == NULL)
+        jl_throw(jl_memory_exception);
+#ifdef _OS_WINDOWS_
+    SetLastError(last_error);
+#endif
+    errno = last_errno;
+    jl_may_leak(base);
+    assert(align > 0);
+    unsigned diff = (offset - (uintptr_t)base) % align;
+    return (void*)((char*)base + diff);
+}
+
+STATIC_INLINE void *gc_try_perm_alloc_pool(size_t sz, unsigned align, unsigned offset) JL_NOTSAFEPOINT
+{
+    uintptr_t pool = LLT_ALIGN(gc_perm_pool + offset, (uintptr_t)align) - offset;
+    uintptr_t end = pool + sz;
+    if (end > gc_perm_end)
+        return NULL;
+    gc_perm_pool = end;
+    return (void*)jl_assume(pool);
+}
+
+// **NOT** a safepoint
+void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset)
+{
+    // The caller should have acquired `gc_perm_lock`
+    assert(align < GC_PERM_POOL_LIMIT);
+#ifndef MEMDEBUG
+    if (__unlikely(sz > GC_PERM_POOL_LIMIT))
+#endif
+        return gc_perm_alloc_large(sz, zero, align, offset);
+    void *ptr = gc_try_perm_alloc_pool(sz, align, offset);
+    if (__likely(ptr))
+        return ptr;
+    int last_errno = errno;
+#ifdef _OS_WINDOWS_
+    DWORD last_error = GetLastError();
+    void *pool = VirtualAlloc(NULL, GC_PERM_POOL_SIZE, MEM_COMMIT, PAGE_READWRITE);
+    SetLastError(last_error);
+    errno = last_errno;
+    if (__unlikely(pool == NULL))
+        return NULL;
+#else
+    void *pool = mmap(0, GC_PERM_POOL_SIZE, PROT_READ | PROT_WRITE,
+                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    errno = last_errno;
+    if (__unlikely(pool == MAP_FAILED))
+        return NULL;
+#endif
+    gc_perm_pool = (uintptr_t)pool;
+    gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE;
+    return gc_try_perm_alloc_pool(sz, align, offset);
+}
+
+// **NOT** a safepoint
+void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset)
+{
+    assert(align < GC_PERM_POOL_LIMIT);
+#ifndef MEMDEBUG
+    if (__unlikely(sz > GC_PERM_POOL_LIMIT))
+#endif
+        return gc_perm_alloc_large(sz, zero, align, offset);
+    uv_mutex_lock(&gc_perm_lock);
+    void *p = jl_gc_perm_alloc_nolock(sz, zero, align, offset);
+    uv_mutex_unlock(&gc_perm_lock);
+    return p;
+}
+
+JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    jl_gc_add_finalizer_th(ptls, v, f);
+}
+
+JL_DLLEXPORT void jl_finalize(jl_value_t *o)
+{
+    jl_finalize_th(jl_current_task, o);
+}
+
+JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_new_weakref_th(ptls, value);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, sz, NULL);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, 0, NULL);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, sizeof(void*), NULL);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, sizeof(void*) * 2, NULL);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, sizeof(void*) * 3, NULL);
+}
+
+JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void)
+{
+    // TODO: meaningful for MMTk?
+    return GC_MAX_SZCLASS;
+}
+ return GC_MAX_SZCLASS; +} + +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) +{ + return sizeof(bigval_t); +} + + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +{ + // TODO: correct for MMTk? + arraylist_push(&ptls->sweep_objs, obj); +} + + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + else if (vt->name == jl_array_typename) { + jl_array_t *a = (jl_array_t*)obj; + start = (char*)a->data; + len = jl_array_len(a); + elsize = a->elsize; + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +static int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/gc-debug.c b/src/gc-debug.c index a233b18d7dcfc..c5ab21a3fb3c1 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + #include "gc.h" #include #include @@ -1231,43 +1233,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)obj; - start = (char*)a->data; - len = jl_array_len(a); - elsize = a->elsize; - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - // Print a backtrace from the `mq->start` of the mark queue up to `mq->current` // `offset` will be added to `mq->current` for convenience in the debugger. 
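The function is a debugging aid meant to be invoked by hand, typically from a debugger while the mark loop is stopped. A plausible invocation on the current thread's queue (illustrative only; this patch does not add such a call):

    // Dump the pending mark work for this thread (hypothetical usage):
    jl_ptls_t ptls = jl_current_task->ptls;
    // offset 0 prints from mq->start up to mq->current;
    // a negative offset stops short of the newest entries.
    gc_mark_loop_unwind(ptls, &ptls->mark_queue, 0);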
NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) @@ -1292,12 +1257,6 @@ NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int off jl_set_safe_restore(old_buf); } -static int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; @@ -1312,3 +1271,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-pages.c b/src/gc-pages.c index d579eb0cd4fbb..e367334450863 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + #include "gc.h" #ifndef _OS_WINDOWS_ # include @@ -335,3 +337,5 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc.c b/src/gc.c index cab7c37369450..e656fa331be38 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + #include "gc.h" #include "julia_gcext.h" #include "julia_assert.h" @@ -7,10 +9,6 @@ #include // for malloc_trim #endif -#ifdef MMTKHEAP -#include "mmtk_julia.h" -#endif - #ifdef __cplusplus extern "C" { #endif @@ -123,9 +121,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre static jl_mutex_t finalizers_lock; static uv_mutex_t gc_cache_lock; -// mutex for gc-heap-snapshot. -jl_mutex_t heapsnapshot_lock; - // Flag that tells us whether we need to support conservative marking // of objects. static _Atomic(int) support_conservative_marking = 0; @@ -162,16 +157,6 @@ static _Atomic(int) support_conservative_marking = 0; * finalizers in unmanaged (GC safe) mode. */ -jl_gc_num_t gc_num = {0}; -static size_t last_long_collect_interval; -int gc_n_threads; -jl_ptls_t* gc_all_tls_states; -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) -{ - return jl_buff_tag; -} - pagetable_t memory_map; // List of marked big objects. Not per-thread. Accessed only by master thread. @@ -185,7 +170,6 @@ bigval_t *big_objects_marked = NULL; // `to_finalize` should not have tagged pointers. arraylist_t finalizer_list_marked; arraylist_t to_finalize; -JL_DLLEXPORT _Atomic(int) jl_gc_have_pending_finalizers = 0; NOINLINE uintptr_t gc_get_stack_ptr(void) { @@ -215,31 +199,26 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) } -void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads); - // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) -STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) +inline void *jl_malloc_aligned(size_t sz, size_t align) { return _aligned_malloc(sz ? sz : 1, align); } -STATIC_INLINE void *jl_realloc_aligned(void *p, size_t sz, size_t oldsz, +inline void *jl_realloc_aligned(void *p, size_t sz, size_t oldsz, size_t align) { (void)oldsz; return _aligned_realloc(p, sz ? 
sz : 1, align); } -STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT +inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT { _aligned_free(p); } #else -STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) +inline void *jl_malloc_aligned(size_t sz, size_t align) { -#ifdef MMTKHEAP - return mmtk_malloc_aligned(sz, align); -#endif #if defined(_P64) || defined(__APPLE__) if (align <= 16) return malloc(sz); @@ -249,17 +228,9 @@ STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) return NULL; return ptr; } -STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, +inline void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align) { -#ifdef MMTKHEAP - void *res = jl_malloc_aligned(sz, align); - if (res != NULL) { - memcpy(res, d, oldsz > sz ? sz : oldsz); - mmtk_free_aligned(d); - } - return res; -#endif #if defined(_P64) || defined(__APPLE__) if (align <= 16) return realloc(d, sz); @@ -271,17 +242,11 @@ STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, } return b; } -STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT +inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT { -#ifdef MMTKHEAP - mmtk_free_aligned(p); -#else free(p); -#endif } #endif -#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT) -#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT) static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT { @@ -292,31 +257,6 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1); } -#ifndef MMTKHEAP -static -#endif -void run_finalizer(jl_task_t *ct, void *o, void *ff) -{ - int ptr_finalizer = gc_ptr_tag(o, 1); - o = gc_ptr_clear_tag(o, 3); - if (ptr_finalizer) { - ((void (*)(void*))ff)((void*)o); - return; - } - JL_TRY { - size_t last_age = ct->world_age; - ct->world_age = jl_atomic_load_acquire(&jl_world_counter); - jl_apply_generic((jl_value_t*)ff, (jl_value_t**)&o, 1); - ct->world_age = last_age; - } - JL_CATCH { - jl_printf((JL_STREAM*)STDERR_FILENO, "error in running finalizer: "); - jl_static_show((JL_STREAM*)STDERR_FILENO, jl_current_exception()); - jl_printf((JL_STREAM*)STDERR_FILENO, "\n"); - jlbacktrace(); // written to STDERR_FILENO - } -} - // if `need_sync` is true, the `list` is the `finalizers` list of another // thread and we need additional synchronizations static void finalize_object(arraylist_t *list, jl_value_t *o, @@ -404,24 +344,8 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO ct->sticky = sticky; } -#ifndef MMTKHEAP -static -#endif -uint64_t finalizer_rngState[4]; - -void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT; - -JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) -{ - jl_rng_split(finalizer_rngState, jl_current_task->rngState); -} - static void run_finalizers(jl_task_t *ct) { -#ifdef MMTKHEAP - mmtk_jl_run_finalizers(ct->ptls); - return; -#endif // Racy fast path: // The race here should be OK since the race can only happen if // another thread is writing to it with the lock held. 
In such case, @@ -460,67 +384,12 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) { if (ct == NULL) ct = jl_current_task; -#ifdef MMTKHEAP - mmtk_jl_run_pending_finalizers(ct->ptls); - return; -#endif jl_ptls_t ptls = ct->ptls; if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { run_finalizers(ct); } } -JL_DLLEXPORT int jl_gc_get_finalizers_inhibited(jl_ptls_t ptls) -{ - if (ptls == NULL) - ptls = jl_current_task->ptls; - return ptls->finalizers_inhibited; -} - -JL_DLLEXPORT void jl_gc_disable_finalizers_internal(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - ptls->finalizers_inhibited++; -} - -JL_DLLEXPORT void jl_gc_enable_finalizers_internal(void) -{ - jl_task_t *ct = jl_current_task; -#ifdef NDEBUG - ct->ptls->finalizers_inhibited--; -#else - jl_gc_enable_finalizers(ct, 1); -#endif -} - -JL_DLLEXPORT void jl_gc_enable_finalizers(jl_task_t *ct, int on) -{ - if (ct == NULL) - ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - int old_val = ptls->finalizers_inhibited; - int new_val = old_val + (on ? -1 : 1); - if (new_val < 0) { - JL_TRY { - jl_error(""); // get a backtrace - } - JL_CATCH { - jl_printf((JL_STREAM*)STDERR_FILENO, "WARNING: GC finalizers already enabled on this thread.\n"); - // Only print the backtrace once, to avoid spamming the logs - static int backtrace_printed = 0; - if (backtrace_printed == 0) { - backtrace_printed = 1; - jlbacktrace(); // written to STDERR_FILENO - } - } - return; - } - ptls->finalizers_inhibited = new_val; - if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers)) { - jl_gc_run_pending_finalizers(ct); - } -} - static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT { void **items = flist->items; @@ -537,6 +406,7 @@ static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT void jl_gc_run_all_finalizers(jl_task_t *ct) { + if (!ct) return; int gc_n_threads; jl_ptls_t* gc_all_tls_states; gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); @@ -554,10 +424,6 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT { -#ifdef MMTKHEAP - register_finalizer(v, f, 0); - return; -#endif assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); arraylist_t *a = &ptls->finalizers; // This acquire load and the release store at the end are used to @@ -586,20 +452,14 @@ void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT { -#ifndef MMTKHEAP jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); -#else - register_finalizer(v, f, 1); -#endif } // schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT { -#ifndef MMTKHEAP assert(!gc_ptr_tag(v, 3)); jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); -#endif } JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT @@ -614,10 +474,6 @@ JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_funct JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) { -#ifdef MMTKHEAP - run_finalizers_for_obj(o); - return; -#endif JL_LOCK_NOGC(&finalizers_lock); // Copy the finalizers into a temporary list so that code in the finalizer // won't change the list as we loop through them. 
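All of the finalizer registration paths above rely on low-bit pointer tagging: Julia objects are aligned, so the two low bits of the stored object pointer are free to record what kind of callback was registered. A condensed sketch of the dispatch, mirroring the run_finalizer logic this patch relocates (gc_ptr_tag and gc_ptr_clear_tag are the mask helpers it moves into gc.h):

    // Finalizer entry tags (sketch):
    //   o | 1      -> f is a C function, called as ((void (*)(void*))f)(o)
    //   o | 3      -> quiescent callback registered via jl_gc_add_quiescent
    //   untagged o -> f is a Julia function, run through jl_apply_generic
    static void dispatch_finalizer(void *o, void *f)
    {
        int c_fin = gc_ptr_tag(o, 1) != 0;  // low bit selects the C path
        o = gc_ptr_clear_tag(o, 3);         // strip both tag bits
        if (c_fin)
            ((void (*)(void *))f)(o);
        // else: jl_apply_generic((jl_value_t*)f, (jl_value_t**)&o, 1);
    }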
@@ -677,28 +533,6 @@ static void gc_sweep_foreign_objs(void) } } -// GC knobs and self-measurement variables -static int64_t last_gc_total_bytes = 0; - -// max_total_memory is a suggestion. We try very hard to stay -// under this limit, but we will go above it rather than halting. -#ifdef _P64 -typedef uint64_t memsize_t; -static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 1250000000UL; -static size_t total_mem; -// We expose this to the user/ci as jl_gc_set_max_memory -static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; -#else -typedef uint32_t memsize_t; -static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 500000000UL; -// Work really hard to stay within 2GB -// Alternative is to risk running out of address space -// on 32 bit architectures. -static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; -#endif - // global variables for GC stats // Resetting the object to a young object, this is used when marking the @@ -761,7 +595,7 @@ int prev_sweep_full = 1; #define inc_sat(v,s) v = (v) >= s ? s : (v)+1 // Full collection heuristics -static int64_t live_bytes = 0; +extern int64_t live_bytes; static int64_t promoted_bytes = 0; static int64_t last_live_bytes = 0; // live_bytes at last collection static int64_t t_start = 0; // Time GC starts; @@ -977,18 +811,14 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT jl_gc_queue_root(v); } -STATIC_INLINE void maybe_collect(jl_ptls_t ptls) +inline void maybe_collect(jl_ptls_t ptls) { -#ifndef MMTKHEAP if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); } else { jl_gc_safepoint_(ptls); } -#else - mmtk_gc_poll(ptls); -#endif } // weak references @@ -999,11 +829,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here -#ifdef MMTKHEAP - mmtk_add_weak_candidate(wr); -#else arraylist_push(&ptls->heap.weak_refs, wr); -#endif return wr; } @@ -1057,7 +883,7 @@ static void sweep_weak_refs(void) // big value list // Size includes the tag and the tag is not cleared!! -STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) { maybe_collect(ptls); size_t offs = offsetof(bigval_t, header); @@ -1085,21 +911,6 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } -// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) -{ - jl_value_t *val = jl_gc_big_alloc_inner(ptls, sz); - maybe_record_alloc_to_profile(val, sz, jl_gc_unknown_type_tag); - return val; -} - -// This wrapper exists only to prevent `jl_gc_big_alloc_inner` from being inlined into -// its callers. We provide an external-facing interface for callers, and inline `jl_gc_big_alloc_inner` -// into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) -jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz) { - return jl_gc_big_alloc_inner(ptls, sz); -} - // Sweep list rooted at *pv, removing and freeing any unmarked objects. // Return pointer to last `next` field in the culled list. 
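The list being culled here is intrusive: each bigval_t's prev member points at the next field that references it (either the list head or the previous node's next), which lets the sweep below unlink a dead object without special-casing the head. A minimal sketch of that unlink step:

    // Splice v out of the big-object list (sketch):
    static void unlink_big(bigval_t *v)
    {
        bigval_t *nxt = v->next;
        *v->prev = nxt;           // whoever pointed at v now points past it
        if (nxt)
            nxt->prev = v->prev;  // fix the successor's back-link
    }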
static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT @@ -1166,108 +977,14 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Arrays with malloc'd storage - -void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT -{ - // This is **NOT** a GC safe point. - mallocarray_t *ma; - if (ptls->heap.mafreelist == NULL) { - ma = (mallocarray_t*)malloc_s(sizeof(mallocarray_t)); - } - else { - ma = ptls->heap.mafreelist; - ptls->heap.mafreelist = ma->next; - } - ma->a = a; - ma->next = ptls->heap.mallocarrays; - ptls->heap.mallocarrays = ma; -} - -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT -{ - jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); -} - -static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT -{ - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc); - dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall); - } - } -} - -static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT -{ - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls = gc_all_tls_states[i]; - if (ptls != NULL) { - memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); - } - } -} - -void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT -{ - combine_thread_gc_counts(&gc_num); - live_bytes += (gc_num.deferred_alloc + gc_num.allocd); - gc_num.allocd = 0; - gc_num.deferred_alloc = 0; - reset_thread_gc_counts(); -} - -size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT -{ - size_t sz = 0; - int isbitsunion = jl_array_isbitsunion(a); - if (jl_array_ndims(a) == 1) - sz = a->elsize * a->maxsize + ((a->elsize == 1 && !isbitsunion) ? 1 : 0); - else - sz = a->elsize * jl_array_len(a); - if (isbitsunion) - // account for isbits Union array selector bytes - sz += jl_array_len(a); - return sz; -} - -#ifndef MMTKHEAP -static -#endif -void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT +static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT { if (a->flags.how == 2) { char *d = (char*)a->data - a->offset*a->elsize; -#ifndef MMTKHEAP if (a->flags.isaligned) jl_free_aligned(d); else free(d); -#else - if (a->flags.isaligned) - mmtk_free_aligned(d); - else { - mmtk_free(d); - } -#endif gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } @@ -1351,7 +1068,7 @@ static NOINLINE jl_taggedvalue_t *add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT } // Size includes the tag and the tag is not cleared!! 
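The pool allocator below is keyed by the pool's byte offset inside jl_tls_states_t rather than by its address, so the same integer identifies the corresponding pool on every thread. A sketch of the round trip, assuming the allocator recovers the pool by adding the offset back onto ptls:

    // Pool selection by offset (sketch):
    size_t allocsz = sz + sizeof(jl_taggedvalue_t);
    int klass = jl_gc_szclass(allocsz);                 // size class index
    jl_gc_pool_t *p = &ptls->heap.norm_pools[klass];    // this thread's pool
    int pool_offset = (int)((char *)p - (char *)ptls);  // thread-independent
    int osize = jl_gc_sizeclasses[klass];               // object size for that class
    // ... inside the allocator, the pool pointer comes back via:
    jl_gc_pool_t *pool = (jl_gc_pool_t *)((char *)ptls + pool_offset);

The version of jl_gc_classify_pools removed below computes the same offset by taking the address of heap.norm_pools[klass] within a null jl_ptls_t, the classic offsetof idiom.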
-STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, +inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) { // Use the pool offset instead of the pool address as the argument @@ -1409,32 +1126,6 @@ STATIC_INLINE jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset return jl_valueof(v); } -// Instrumented version of jl_gc_pool_alloc_inner, called into by LLVM-generated code. -JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, - int osize) -{ - jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); - maybe_record_alloc_to_profile(val, osize, jl_gc_unknown_type_tag); - return val; -} - -// This wrapper exists only to prevent `jl_gc_pool_alloc_inner` from being inlined into -// its callers. We provide an external-facing interface for callers, and inline `jl_gc_pool_alloc_inner` -// into this. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) -jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize) { - return jl_gc_pool_alloc_inner(ptls, pool_offset, osize); -} - -int jl_gc_classify_pools(size_t sz, int *osize) -{ - if (sz > GC_MAX_SZCLASS) - return -1; - size_t allocsz = sz + sizeof(jl_taggedvalue_t); - int klass = jl_gc_szclass(allocsz); - *osize = jl_gc_sizeclasses[klass]; - return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); -} - // sweep phase int64_t lazy_freed_pages = 0; @@ -1743,7 +1434,6 @@ static void gc_sweep_perm_alloc(void) JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) { -#ifndef MMTKHEAP jl_ptls_t ptls = jl_current_task->ptls; jl_taggedvalue_t *o = jl_astaggedvalue(ptr); // The modification of the `gc_bits` is not atomic but it @@ -1753,7 +1443,6 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) o->bits.gc = GC_MARKED; arraylist_push(ptls->heap.remset, (jl_value_t*)ptr); ptls->heap.remset_nptr++; // conservative -#endif } void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT @@ -2639,27 +2328,6 @@ JL_EXTENSION NOINLINE void gc_mark_loop(jl_ptls_t ptls) gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); } -#ifndef MMTKHEAP -static -#endif -void gc_premark(jl_ptls_t ptls2) -{ - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - // avoid counting remembered objects - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t *)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; - } -} - static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; @@ -2797,93 +2465,6 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -// collector entry point and control -static _Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; -#ifdef MMTKHEAP - enable_collection(); -#endif - } - } - else if (prev && !on) { -#ifdef MMTKHEAP - disable_collection(); -#endif - // enable -> disable - 
jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT -{ - jl_gc_num_t num = gc_num; - combine_thread_gc_counts(&num); - // Sync this logic with `base/util.jl:GC_Diff` - *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); -} - -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - -JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) -{ - jl_gc_num_t num = gc_num; - combine_thread_gc_counts(&num); - return num; -} - -JL_DLLEXPORT void jl_gc_reset_stats(void) -{ - gc_num.max_pause = 0; - gc_num.max_memory = 0; - gc_num.max_time_to_safepoint = 0; -} - -// TODO: these were supposed to be thread local -JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT -{ - int64_t oldtb = last_gc_total_bytes; - int64_t newtb; - jl_gc_get_total_bytes(&newtb); - last_gc_total_bytes = newtb; - return newtb - oldtb; -} - -JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT -{ - int64_t oldtb = last_gc_total_bytes; - int64_t newtb; - jl_gc_get_total_bytes(&newtb); - last_gc_total_bytes = newtb - offset; - return newtb - oldtb; -} - -JL_DLLEXPORT int64_t jl_gc_live_bytes(void) -{ - return live_bytes; -} - size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3165,10 +2746,6 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } -#ifdef MMTKHEAP - handle_user_collection_request(ptls); - return; -#endif jl_gc_debug_print(); int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state); @@ -3258,11 +2835,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // allocator entry points -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3302,10 +2874,6 @@ void jl_init_thread_heap(jl_ptls_t ptls) memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); -#ifdef MMTKHEAP - MMTk_Mutator mmtk_mutator = bind_mutator((void *)ptls, ptls->tid); - ptls->mmtk_mutator_ptr = ((MMTkMutatorContext*)mmtk_mutator); -#endif } // System-wide initializations @@ -3344,67 +2912,9 @@ void jl_gc_init(void) if (high_water_mark < max_total_memory) max_total_memory = high_water_mark; - -#ifdef MMTKHEAP - long long min_heap_size; - long long max_heap_size; - char* min_size_def = getenv("MMTK_MIN_HSIZE"); - char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); - - char* max_size_def = getenv("MMTK_MAX_HSIZE"); - char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); - - // default min heap currently set as Julia's default_collect_interval - if (min_size_def != NULL) { - char *p; - double min_size = strtod(min_size_def, &p); - min_heap_size = (long) 1024 * 1024 * min_size; - } else if (min_size_gb != NULL) { - char *p; - double min_size = strtod(min_size_gb, &p); - min_heap_size = (long) 1024 * 1024 * 1024 * min_size; - } else { - min_heap_size = default_collect_interval; - } - - // default max heap currently set as 70% the free memory in the system - if (max_size_def != NULL) { - char *p; - double max_size = strtod(max_size_def, &p); - max_heap_size = (long) 
1024 * 1024 * max_size; - } else if (max_size_gb != NULL) { - char *p; - double max_size = strtod(max_size_gb, &p); - max_heap_size = (long) 1024 * 1024 * 1024 * max_size; - } else { - max_heap_size = uv_get_free_memory() * 70 / 100; - } - - // if only max size is specified initialize MMTk with a fixed size heap - if (max_size_def != NULL || max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL)) { - gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); - } else { - gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); - } - -#endif t_start = jl_hrtime(); } -JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) -{ - if (max_mem > 0 - && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1)) { - max_total_memory = max_mem; - } -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} - // allocation wrappers that track allocation and let collection run JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) @@ -3418,9 +2928,6 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); -#ifdef MMTKHEAP - return mmtk_counted_malloc(sz); -#endif } return malloc(sz); } @@ -3436,9 +2943,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); -#ifdef MMTKHEAP - return mmtk_counted_calloc(nm, sz); -#endif } return calloc(nm, sz); } @@ -3447,18 +2951,14 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; + free(p); if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); jl_atomic_store_relaxed(&ptls->gc_num.freecall, jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); -#ifdef MMTKHEAP - mmtk_free_with_size(p, sz); - return; -#endif } - free(p); } JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) @@ -3476,151 +2976,12 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); -#ifdef MMTKHEAP - return mmtk_realloc_with_old_size(p, sz, old); -#endif } return realloc(p, sz); } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -// allocating blocks for Arrays and Strings - -JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - maybe_collect(ptls); - size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); -#endif - void *b = malloc_cache_align(allocsz); - if (b == NULL) - jl_throw(jl_memory_exception); -#ifdef _OS_WINDOWS_ - SetLastError(last_error); -#endif - errno = last_errno; - // jl_gc_managed_malloc is currently always used for allocating array buffers. 
- maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); - return b; -} - -static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, - int isaligned, jl_value_t *owner, int8_t can_collect) -{ - if (can_collect) - maybe_collect(ptls); - - size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - - if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; - live_bytes += allocsz - oldsz; - } - else if (allocsz < oldsz) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz)); - else - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); - jl_atomic_store_relaxed(&ptls->gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); -#endif - void *b; - if (isaligned) - b = realloc_cache_align(d, allocsz, oldsz); - else - b = realloc(d, allocsz); - if (b == NULL) - jl_throw(jl_memory_exception); -#ifdef _OS_WINDOWS_ - SetLastError(last_error); -#endif - errno = last_errno; - maybe_record_alloc_to_profile((jl_value_t*)b, sz, jl_gc_unknown_type_tag); - return b; -} - -JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, - int isaligned, jl_value_t *owner) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return gc_managed_realloc_(ptls, d, sz, oldsz, isaligned, owner, 1); -} - jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) { -#ifndef MMTKHEAP size_t len = jl_string_len(s); if (sz <= len) return s; jl_taggedvalue_t *v = jl_astaggedvalue(s); @@ -3654,148 +3015,6 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) jl_value_t *snew = jl_valueof(&newbig->header); *(size_t*)snew = sz; return snew; -#else - size_t len = jl_string_len(s); - jl_value_t *snew = jl_alloc_string(sz); - memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? sz : len); - return snew; -#endif -} - -// Perm gen allocator -// 2M pool -#define GC_PERM_POOL_SIZE (2 * 1024 * 1024) -// 20k limit for pool allocation. At most 1% fragmentation -#define GC_PERM_POOL_LIMIT (20 * 1024) -uv_mutex_t gc_perm_lock; -static uintptr_t gc_perm_pool = 0; -static uintptr_t gc_perm_end = 0; - -static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT -{ - // `align` must be power of two - assert(offset == 0 || offset < align); - const size_t malloc_align = sizeof(void*) == 8 ? 16 : 4; - if (align > 1 && (offset != 0 || align > malloc_align)) - sz += align - 1; - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); -#endif - void *base = zero ? 
calloc(1, sz) : malloc(sz); - if (base == NULL) - jl_throw(jl_memory_exception); -#ifdef _OS_WINDOWS_ - SetLastError(last_error); -#endif - errno = last_errno; - jl_may_leak(base); - assert(align > 0); - unsigned diff = (offset - (uintptr_t)base) % align; - return (void*)((char*)base + diff); -} - -STATIC_INLINE void *gc_try_perm_alloc_pool(size_t sz, unsigned align, unsigned offset) JL_NOTSAFEPOINT -{ - uintptr_t pool = LLT_ALIGN(gc_perm_pool + offset, (uintptr_t)align) - offset; - uintptr_t end = pool + sz; - if (end > gc_perm_end) - return NULL; - gc_perm_pool = end; - return (void*)jl_assume(pool); -} - -// **NOT** a safepoint -void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) -{ - // The caller should have acquired `gc_perm_lock` - assert(align < GC_PERM_POOL_LIMIT); -#ifndef MEMDEBUG - if (__unlikely(sz > GC_PERM_POOL_LIMIT)) -#endif - return gc_perm_alloc_large(sz, zero, align, offset); - void *ptr = gc_try_perm_alloc_pool(sz, align, offset); - if (__likely(ptr)) - return ptr; - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); - void *pool = VirtualAlloc(NULL, GC_PERM_POOL_SIZE, MEM_COMMIT, PAGE_READWRITE); - SetLastError(last_error); - errno = last_errno; - if (__unlikely(pool == NULL)) - return NULL; -#else - void *pool = mmap(0, GC_PERM_POOL_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - errno = last_errno; - if (__unlikely(pool == MAP_FAILED)) - return NULL; -#endif - gc_perm_pool = (uintptr_t)pool; - gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; - return gc_try_perm_alloc_pool(sz, align, offset); -} - -// **NOT** a safepoint -void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) -{ - assert(align < GC_PERM_POOL_LIMIT); -#ifndef MEMDEBUG - if (__unlikely(sz > GC_PERM_POOL_LIMIT)) -#endif - return gc_perm_alloc_large(sz, zero, align, offset); - uv_mutex_lock(&gc_perm_lock); - void *p = jl_gc_perm_alloc_nolock(sz, zero, align, offset); - uv_mutex_unlock(&gc_perm_lock); - return p; -} - -JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) -{ - jl_ptls_t ptls = jl_current_task->ptls; - jl_gc_add_finalizer_th(ptls, v, f); -} - -JL_DLLEXPORT void jl_finalize(jl_value_t *o) -{ - jl_finalize_th(jl_current_task, o); -} - -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - -JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, 0, NULL); -} - -JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sizeof(void*), NULL); -} - -JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sizeof(void*) * 2, NULL); -} - -JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sizeof(void*) * 3, NULL); } JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) @@ -3915,27 +3134,16 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) return NULL; } -JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) -{ - return GC_MAX_SZCLASS; -} - -JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) -{ - return sizeof(bigval_t); -} - - -JL_DLLEXPORT void * 
jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +// added for MMTk integration +void enable_collection(void) { - return jl_gc_alloc(ptls, sz, ty); } - -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void disable_collection(void) { - arraylist_push(&ptls->sweep_objs, obj); } #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc.h b/src/gc.h index 930f7f3c30594..1db0211eb6c68 100644 --- a/src/gc.h +++ b/src/gc.h @@ -4,6 +4,7 @@ allocation and garbage collection . non-moving, precise mark and sweep collector . pool-allocates small objects, keeps big objects on a simple list + MMTk alternative */ #ifndef JL_GC_H @@ -27,36 +28,48 @@ #include "gc-heap-snapshot.h" #include "gc-alloc-profiler.h" -#ifdef __cplusplus -extern "C" { +// interface from and to gc-common.c +extern void maybe_collect(jl_ptls_t ptls); +extern void run_finalizer(jl_task_t *ct, void *o, void *ff); +extern void *jl_malloc_aligned(size_t sz, size_t align); +extern void *jl_gc_counted_calloc(size_t nm, size_t sz); +extern void jl_gc_counted_free_with_size(void *p, size_t sz); +extern void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); +extern void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align); +extern void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f); +extern void jl_finalize_th(jl_task_t *ct, jl_value_t *o); +extern jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value); +extern jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz); +extern jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize); +extern void jl_rng_split(uint64_t to[4], uint64_t from[4]); +extern void gc_premark(jl_ptls_t ptls2); +extern void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, + int isaligned, jl_value_t *owner, int8_t can_collect); +extern size_t jl_array_nbytes(jl_array_t *a); +extern void objprofile_count(void *ty, int old, int sz); + +#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT) +#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT) + +// common types and globals +#ifdef _P64 +typedef uint64_t memsize_t; +#else +typedef uint32_t memsize_t; #endif -#define GC_PAGE_LG2 14 // log2(size of a page) -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k -#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) - -#define jl_malloc_tag ((void*)0xdeadaa01) -#define jl_singleton_tag ((void*)0xdeadaa02) - -// Used by GC_DEBUG_ENV -typedef struct { - uint64_t num; - uint64_t next; - uint64_t min; - uint64_t interv; - uint64_t max; - unsigned short random[3]; -} jl_alloc_num_t; - -typedef struct { - int always_full; - int wait_for_debugger; - jl_alloc_num_t pool; - jl_alloc_num_t other; - jl_alloc_num_t print; -} jl_gc_debug_env_t; +extern const size_t default_collect_interval; +extern const size_t max_collect_interval; +extern size_t last_long_collect_interval; +extern size_t total_mem; +extern memsize_t max_total_memory; +extern _Atomic(uint32_t) jl_gc_disable_counter; +extern jl_mutex_t heapsnapshot_lock; +extern uint64_t finalizer_rngState[]; +extern int gc_n_threads; +extern jl_ptls_t* gc_all_tls_states; -// This struct must be kept in sync with the Julia type of the same name in base/timing.jl +// keep in sync with the Julia type of the same name in base/timing.jl typedef struct { int64_t allocd; int64_t deferred_alloc; @@ -82,29 +95,18 @@ typedef 
struct { uint64_t total_mark_time; } jl_gc_num_t; -typedef enum { - GC_empty_chunk, - GC_objary_chunk, - GC_ary8_chunk, - GC_ary16_chunk, - GC_finlist_chunk, -} gc_chunk_id_t; +extern jl_gc_num_t gc_num; -typedef struct _jl_gc_chunk_t { - gc_chunk_id_t cid; - struct _jl_value_t *parent; - struct _jl_value_t **begin; - struct _jl_value_t **end; - void *elem_begin; - void *elem_end; - uint32_t step; - uintptr_t nptr; -} jl_gc_chunk_t; +// data structure for tracking malloc'd arrays. +typedef struct _mallocarray_t { + jl_array_t *a; + struct _mallocarray_t *next; +} mallocarray_t; -#define MAX_REFS_AT_ONCE (1 << 16) +extern void combine_thread_gc_counts(jl_gc_num_t *dest); +extern void reset_thread_gc_counts(void); // layout for big (>2k) objects - JL_EXTENSION typedef struct _bigval_t { struct _bigval_t *next; struct _bigval_t **prev; // pointer to the next field of the prev entry @@ -129,12 +131,111 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd arrays. +STATIC_INLINE uintptr_t gc_ptr_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT +{ + return ((uintptr_t)v) & mask; +} -typedef struct _mallocarray_t { - jl_array_t *a; - struct _mallocarray_t *next; -} mallocarray_t; +STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT +{ + return (void*)(((uintptr_t)v) & ~mask); +} + +STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT +{ + return (bits & GC_MARKED) != 0; +} + +#ifdef GC_VERIFY +#error "GC_VERIFY is unsupported with MMTk" +#endif + +#ifdef MEMFENCE +#error "MEMFENCE is unsupported with MMTk" +#endif + +#ifdef GC_DEBUG_ENV +#error "GC_DEBUG_ENV is unsupported with MMTk" +#endif + +#ifdef GC_FINAL_STATS +#error "GC_FINAL_STATS is currently unsupported with MMTk" +#endif + +#ifdef GC_TIME +#error "GC_TIME is currently unsupported with MMTk" +#endif + +#ifdef MEMPROFILE +#error "MEMPROFILE is not supported with MMTk" +#endif + +#ifdef OBJPROFILE +#ifdef MMTK_GC +#warning "OBJPROFILE is unsupported with MMTk; disabling" +#undef OBJPROFILE +#endif +#endif + + +#ifdef MMTK_GC +#include "mmtk.h" + +typedef struct { + char c; +} jl_gc_pagemeta_t; + +#else // !MMTK_GC + +#ifdef __cplusplus +extern "C" { +#endif + +#define GC_PAGE_LG2 14 // log2(size of a page) +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k +#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) + +#define jl_malloc_tag ((void*)0xdeadaa01) +#define jl_singleton_tag ((void*)0xdeadaa02) + +// Used by GC_DEBUG_ENV +typedef struct { + uint64_t num; + uint64_t next; + uint64_t min; + uint64_t interv; + uint64_t max; + unsigned short random[3]; +} jl_alloc_num_t; + +typedef struct { + int always_full; + int wait_for_debugger; + jl_alloc_num_t pool; + jl_alloc_num_t other; + jl_alloc_num_t print; +} jl_gc_debug_env_t; + +typedef enum { + GC_empty_chunk, + GC_objary_chunk, + GC_ary8_chunk, + GC_ary16_chunk, + GC_finlist_chunk, +} gc_chunk_id_t; + +typedef struct _jl_gc_chunk_t { + gc_chunk_id_t cid; + struct _jl_value_t *parent; + struct _jl_value_t **begin; + struct _jl_value_t **end; + void *elem_begin; + void *elem_end; + uint32_t step; + uintptr_t nptr; +} jl_gc_chunk_t; + +#define MAX_REFS_AT_ONCE (1 << 16) // pool page metadata typedef struct { @@ -250,14 +351,11 @@ STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) } #endif -extern jl_gc_num_t gc_num; extern pagetable_t memory_map; extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern 
arraylist_t to_finalize; extern int64_t lazy_freed_pages; -extern int gc_n_threads; -extern jl_ptls_t* gc_all_tls_states; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { @@ -280,11 +378,6 @@ STATIC_INLINE jl_taggedvalue_t *page_pfl_end(jl_gc_pagemeta_t *p) JL_NOTSAFEPOIN return (jl_taggedvalue_t*)(p->data + p->fl_end_offset); } -STATIC_INLINE int gc_marked(uintptr_t bits) JL_NOTSAFEPOINT -{ - return (bits & GC_MARKED) != 0; -} - STATIC_INLINE int gc_old(uintptr_t bits) JL_NOTSAFEPOINT { return (bits & GC_OLD) != 0; @@ -295,16 +388,6 @@ STATIC_INLINE uintptr_t gc_set_bits(uintptr_t tag, int bits) JL_NOTSAFEPOINT return (tag & ~(uintptr_t)3) | bits; } -STATIC_INLINE uintptr_t gc_ptr_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT -{ - return ((uintptr_t)v) & mask; -} - -STATIC_INLINE void *gc_ptr_clear_tag(void *v, uintptr_t mask) JL_NOTSAFEPOINT -{ - return (void*)(((uintptr_t)v) & ~mask); -} - NOINLINE uintptr_t gc_get_stack_ptr(void); STATIC_INLINE jl_gc_pagemeta_t *page_metadata(void *_data) JL_NOTSAFEPOINT @@ -538,24 +621,6 @@ static inline void gc_scrub(void) } #endif -#ifdef OBJPROFILE -void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT; -void objprofile_printall(void); -void objprofile_reset(void); -#else -static inline void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT -{ -} - -static inline void objprofile_printall(void) -{ -} - -static inline void objprofile_reset(void) -{ -} -#endif - #ifdef MEMPROFILE void gc_stats_all_pool(void); void gc_stats_big_obj(void); @@ -567,8 +632,6 @@ void gc_stats_big_obj(void); // For debugging void gc_count_pool(void); -size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; - JL_DLLEXPORT void jl_enable_gc_logging(int enable); void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; @@ -576,4 +639,6 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect } #endif +#endif // !MMTK_GC + #endif diff --git a/src/init.c b/src/init.c index 45d6b8ee98873..2bfdebe00dfaf 100644 --- a/src/init.c +++ b/src/init.c @@ -295,12 +295,7 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER JL_STDOUT = (uv_stream_t*) STDOUT_FILENO; JL_STDERR = (uv_stream_t*) STDERR_FILENO; -#ifndef MMTKHEAP - if (ct) - jl_gc_run_all_finalizers(ct); -#else - mmtk_jl_gc_run_all_finalizers(); -#endif + jl_gc_run_all_finalizers(ct); uv_loop_t *loop = jl_global_event_loop(); if (loop != NULL) { @@ -811,11 +806,9 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) jl_ptls_t ptls = jl_init_threadtls(0); -#ifdef MMTKHEAP - // start MMTk's GC - initialize_collection((void*) ptls); +#ifdef MMTK_GC + initialize_collection((void *)ptls); #endif - #pragma GCC diagnostic push #if defined(_COMPILER_GCC_) && __GNUC__ >= 12 #pragma GCC diagnostic ignored "-Wdangling-pointer" diff --git a/src/julia.h b/src/julia.h index 2bc1a97b681ed..8a8624360fc7a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -930,29 +930,25 @@ JL_DLLEXPORT void jl_clear_malloc_data(void); JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *root) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *root, const jl_value_t *stored) JL_NOTSAFEPOINT; +#ifndef MMTK_GC STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { -#ifndef MMTKHEAP // parent and ptr isa jl_value_t* if (__unlikely(jl_astaggedvalue(parent)->bits.gc == 3 && // parent is old and not in remset (jl_astaggedvalue(ptr)->bits.gc & 1) == 0)) // ptr is young 
jl_gc_queue_root((jl_value_t*)parent); -#endif } STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* { -#ifndef MMTKHEAP // if ptr is old if (__unlikely(jl_astaggedvalue(ptr)->bits.gc == 3)) { jl_gc_queue_root((jl_value_t*)ptr); } -#endif } STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { -#ifndef MMTKHEAP // ptr is an immutable object if (__likely(jl_astaggedvalue(parent)->bits.gc != 3)) return; // parent is young or in remset @@ -962,9 +958,23 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_ const jl_datatype_layout_t *ly = dt->layout; if (ly->npointers) jl_gc_queue_multiroot((jl_value_t*)parent, ptr); -#endif } +#else // MMTK_GC + +STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ +} + +STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* +{ +} + +STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +{ +} +#endif // MMTK_GC + JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner); diff --git a/src/julia_internal.h b/src/julia_internal.h index 5b60be740bfb8..b921c63444e86 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -326,13 +326,15 @@ extern jl_array_t *jl_all_methods JL_GLOBALLY_ROOTED; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; +void enable_collection(void); +void disable_collection(void); jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); -#ifdef MMTKHEAP +#ifdef MMTK_GC JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); -#endif +#endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; void *jl_gc_perm_alloc_nolock(size_t sz, int zero, @@ -451,37 +453,50 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); +#ifndef MMTK_GC STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) { jl_value_t *v; const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { -#ifndef MMTKHEAP int pool_id = jl_gc_szclass(allocsz); jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); -#else + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + +#else // MMTK_GC + +STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass(allocsz); int osize = jl_gc_sizeclasses[pool_id]; v = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, ty); -#endif } else { if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); -#ifndef MMTKHEAP - v = jl_gc_big_alloc_noinline(ptls, allocsz); -#else v = jl_mmtk_gc_alloc_big(ptls, allocsz); -#endif } jl_set_typeof(v, ty); maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); return v; } +#endif // MMTK_GC /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a @@ -576,24 +591,32 @@ void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT; void gc_setmark_buf(jl_ptls_t ptls, void *buf, uint8_t, size_t) JL_NOTSAFEPOINT; +#ifndef MMTK_GC STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* { -#ifndef MMTKHEAP jl_gc_wb(bnd, val); -#endif } STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* { -#ifndef MMTKHEAP // if parent is marked and buf is not if (__unlikely(jl_astaggedvalue(parent)->bits.gc & 1)) { jl_task_t *ct = jl_current_task; gc_setmark_buf(ct->ptls, bufptr, 3, minsz); } -#endif } +#else // MMTK_GC + +STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* +{ +} + +STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* +{ +} +#endif // MMTK_GC + void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); diff --git a/src/julia_threads.h b/src/julia_threads.h index c15f19e78966f..17d9d0857dc39 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -4,7 +4,7 @@ #ifndef JL_THREADS_H #define JL_THREADS_H -#ifdef MMTKHEAP +#ifdef MMTK_GC #include "mmtkMutator.h" #endif @@ -281,7 +281,7 @@ typedef struct _jl_tls_states_t { uint64_t sleep_leave; ) -#ifdef MMTKHEAP +#ifdef MMTK_GC MMTkMutatorContext* mmtk_mutator_ptr; void* cursor; void* limit; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 3e2eb3bcdf6ed..5b8eeb49f60ad 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -224,12 +224,12 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), sz + sizeof(void*)); } else { -#ifndef MMTKHEAP +#ifndef MMTK_GC auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize }); derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize); - #else +#else // MMTK_GC auto pool_osize_i32 = 
ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); @@ -295,7 +295,7 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) phiNode->takeName(target); return phiNode; -#endif +#endif // MMTK_GC } newI->setAttributes(newI->getCalledFunction()->getAttributes()); newI->addRetAttr(derefAttr); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index c46228f13490b..ea390f01010fd 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -226,13 +226,8 @@ namespace jl_intrinsics { } namespace jl_well_known { -#ifndef MMTKHEAP static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); -#else - static const char *GC_BIG_ALLOC_NAME = XSTR(jl_mmtk_gc_alloc_big); - static const char *GC_POOL_ALLOC_NAME = XSTR(jl_mmtk_gc_alloc_default_llvm); -#endif static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); using jl_intrinsics::addGCAllocAttributes; diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c new file mode 100644 index 0000000000000..00cd54c9df920 --- /dev/null +++ b/src/mmtk-gc.c @@ -0,0 +1,487 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#ifdef MMTK_GC + +#include "gc.h" +#include "mmtk_julia.h" +#include "julia_gcext.h" + +// callbacks +// --- + +typedef void (*jl_gc_cb_func_t)(void); + +JL_DLLEXPORT void jl_gc_set_cb_root_scanner(jl_gc_cb_root_scanner_t cb, int enable) +{ +} +JL_DLLEXPORT void jl_gc_set_cb_task_scanner(jl_gc_cb_task_scanner_t cb, int enable) +{ +} +JL_DLLEXPORT void jl_gc_set_cb_pre_gc(jl_gc_cb_pre_gc_t cb, int enable) +{ +} +JL_DLLEXPORT void jl_gc_set_cb_post_gc(jl_gc_cb_post_gc_t cb, int enable) +{ +} +JL_DLLEXPORT void jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_alloc_t cb, int enable) +{ +} +JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable) +{ +} + + +inline void maybe_collect(jl_ptls_t ptls) +{ + mmtk_gc_poll(ptls); +} + + +// malloc wrappers, aligned allocation +// --- + +inline void *jl_malloc_aligned(size_t sz, size_t align) +{ + return mmtk_malloc_aligned(sz ? sz : 1, align); // XXX sz +} +inline void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, + size_t align) +{ + void *res = jl_malloc_aligned(sz, align); + if (res != NULL) { + memcpy(res, d, oldsz > sz ? sz : oldsz); + mmtk_free_aligned(d); + } + return res; +} +inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT +{ + mmtk_free_aligned(p); +} + + +// finalizers +// --- + +JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) +{ + if (ct == NULL) + ct = jl_current_task; + mmtk_jl_run_pending_finalizers(ct->ptls); +} + +JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT +{ + register_finalizer(v, f, 1); +} + +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + /* TODO: unsupported? 
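+       (These callbacks are currently dropped: MMTk does not appear to
+       expose a matching quiescence hook yet.)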
*/ +} + +JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT +{ + if (__unlikely(jl_typeis(f, jl_voidpointer_type))) { + jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); + } + else { + register_finalizer(v, f, 0); + } +} + +JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) +{ + run_finalizers_for_obj(o); +} + +void jl_gc_run_all_finalizers(jl_task_t *ct) +{ + mmtk_jl_gc_run_all_finalizers(); +} + +void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT +{ + register_finalizer(v, f, 0); +} + + +// weak references +// --- +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + + +// big values +// --- + +// Size includes the tag and the tag is not cleared!! +inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) +{ + // TODO: assertion needed here? + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + // TODO: drop this okay? + // maybe_collect(ptls); + + jl_value_t *v = jl_mmtk_gc_alloc_big(ptls, sz); + // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_big; enable + // here when that's edited? + /* + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); + */ + // TODO: move to jl_mmtk_gc_alloc_big if needed. +/* +#ifdef MEMDEBUG + memset(v, 0xee, allocsz); +#endif +*/ + // TODO: need to set this? have to move to jl_mmtk_gc_alloc_big then. + // v->age = 0; + // TODO: dropping this; confirm okay? `sweep_big` no longer needed? + // gc_big_object_link(v, &ptls->heap.big_objects); + return v; +} + +// Size includes the tag and the tag is not cleared!! +inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); +#ifdef MEMDEBUG + return jl_gc_big_alloc(ptls, osize); +#endif + // TODO: drop this okay? + // maybe_collect(ptls); + + jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, pool_offset, osize, NULL); + // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_default; enable + // here when that's edited? + /* + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); + */ + return v; +} + +void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT +{ + if (a->flags.how == 2) { + char *d = (char*)a->data - a->offset*a->elsize; + if (a->flags.isaligned) + mmtk_free_aligned(d); + else + mmtk_free(d); + gc_num.freed += jl_array_nbytes(a); + gc_num.freecall++; + } +} + + +// roots +// --- + +JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) +{ + /* TODO: not needed? */ +} + +// TODO: exported, but not MMTk-specific? +JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +{ + /* TODO: confirm not needed? 
*/ +} + + +// marking +// --- + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + return 0; +} +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ +} + + +// GC control +// --- + +JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_relaxed(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + handle_user_collection_request(ptls); +} + +// Per-thread initialization +// TODO: remove `norm_pools`, `weak_refs`, etc. from `heap`? +// TODO: remove `gc_cache`? +void jl_init_thread_heap(jl_ptls_t ptls) +{ + jl_thread_heap_t *heap = &ptls->heap; + jl_gc_pool_t *p = heap->norm_pools; + for (int i = 0; i < JL_GC_N_POOLS; i++) { + p[i].osize = jl_gc_sizeclasses[i]; + p[i].freelist = NULL; + p[i].newpages = NULL; + } + arraylist_new(&heap->weak_refs, 0); + arraylist_new(&heap->live_tasks, 0); + heap->mallocarrays = NULL; + heap->mafreelist = NULL; + heap->big_objects = NULL; + heap->remset = &heap->_remset[0]; + heap->last_remset = &heap->_remset[1]; + arraylist_new(heap->remset, 0); + arraylist_new(heap->last_remset, 0); + arraylist_new(&ptls->finalizers, 0); + arraylist_new(&ptls->sweep_objs, 0); + + jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + gc_cache->perm_scanned_bytes = 0; + gc_cache->scanned_bytes = 0; + gc_cache->nbig_obj = 0; + + memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); + jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + + MMTk_Mutator mmtk_mutator = bind_mutator((void *)ptls, ptls->tid); + ptls->mmtk_mutator_ptr = ((MMTkMutatorContext*)mmtk_mutator); +} + +// System-wide initialization +// TODO: remove locks? remove anything else? +void jl_gc_init(void) +{ + if (jl_options.heap_size_hint) + jl_gc_set_max_memory(jl_options.heap_size_hint); + + JL_MUTEX_INIT(&heapsnapshot_lock); + uv_mutex_init(&gc_perm_lock); + + gc_num.interval = default_collect_interval; + last_long_collect_interval = default_collect_interval; + gc_num.allocd = 0; + gc_num.max_pause = 0; + gc_num.max_memory = 0; + +#ifdef _P64 + total_mem = uv_get_total_memory(); + uint64_t constrained_mem = uv_get_constrained_memory(); + if (constrained_mem > 0 && constrained_mem < total_mem) + total_mem = constrained_mem; +#endif + + // We allocate with abandon until we get close to the free memory on the machine. 
+ uint64_t free_mem = uv_get_available_memory(); + uint64_t high_water_mark = free_mem / 10 * 7; // 70% high water mark + + if (high_water_mark < max_total_memory) + max_total_memory = high_water_mark; + + // MMTk-specific + long long min_heap_size; + long long max_heap_size; + char* min_size_def = getenv("MMTK_MIN_HSIZE"); + char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); + + char* max_size_def = getenv("MMTK_MAX_HSIZE"); + char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); + + // default min heap currently set as Julia's default_collect_interval + if (min_size_def != NULL) { + char *p; + double min_size = strtod(min_size_def, &p); + min_heap_size = (long) 1024 * 1024 * min_size; + } else if (min_size_gb != NULL) { + char *p; + double min_size = strtod(min_size_gb, &p); + min_heap_size = (long) 1024 * 1024 * 1024 * min_size; + } else { + min_heap_size = default_collect_interval; + } + + // default max heap currently set as 70% the free memory in the system + if (max_size_def != NULL) { + char *p; + double max_size = strtod(max_size_def, &p); + max_heap_size = (long) 1024 * 1024 * max_size; + } else if (max_size_gb != NULL) { + char *p; + double max_size = strtod(max_size_gb, &p); + max_heap_size = (long) 1024 * 1024 * 1024 * max_size; + } else { + max_heap_size = uv_get_free_memory() * 70 / 100; + } + + // if only max size is specified initialize MMTk with a fixed size heap + if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { + gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + } else { + gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + } +} + +// allocation wrappers that track allocation and let collection run + +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + return mmtk_counted_malloc(sz); + } + return malloc(sz); +} + +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + return mmtk_counted_calloc(nm, sz); + } + return calloc(nm, sz); +} + +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + jl_atomic_store_relaxed(&ptls->gc_num.freed, + jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); + jl_atomic_store_relaxed(&ptls->gc_num.freecall, + jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); + mmtk_free_with_size(p, sz); + return; + } + free(p); +} + +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); + if (sz < old) + jl_atomic_store_relaxed(&ptls->gc_num.freed, 
+ jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); + else + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); + return mmtk_realloc_with_old_size(p, sz, old); + } + // TODO: correct? + return realloc(p, sz); +} + +jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) +{ + size_t len = jl_string_len(s); + jl_value_t *snew = jl_alloc_string(sz); + memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? sz : len); + return snew; +} + +JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) +{ + return 0; +} + +JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void) +{ + return 0; +} + +// TODO: if this is needed, it can be added in MMTk +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) +{ + return NULL; +} + + +// gc-debug functions +// --- + +jl_gc_pagemeta_t *jl_gc_page_metadata(void *data) +{ + return NULL; +} + +JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) +{ + return NULL; +} + +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT +{ +} + +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT +{ + // May not be accurate but should be helpful enough + uint64_t pool_count = gc_num.poolalloc; + uint64_t big_count = gc_num.bigalloc; + jl_safe_printf("Allocations: %" PRIu64 " " + "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", + pool_count + big_count, pool_count, big_count, gc_num.pause); +} + +void jl_print_gc_stats(JL_STREAM *s) +{ +} + +void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT +{ +} + +void objprofile_printall(void) +{ +} + +void objprofile_reset(void) +{ +} + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/threading.c b/src/threading.c index 52b3fc2d8c06d..bc31eb1e46bb6 100644 --- a/src/threading.c +++ b/src/threading.c @@ -345,12 +345,12 @@ jl_ptls_t jl_init_threadtls(int16_t tid) #endif ptls->system_id = (jl_thread_t)(uintptr_t)uv_thread_self(); ptls->rngseed = jl_rand(); - if (tid == 0) + if (tid == 0) { ptls->disable_gc = 1; -#ifdef MMTKHEAP - if (tid == 0) +#ifdef MMTK_GC disable_collection(); #endif + } #ifdef _OS_WINDOWS_ if (tid == 0) { if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), From 9dbc8fc65e1e273cefbbe87b20e35e4c43a7ebaf Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Wed, 3 May 2023 16:56:37 +1200 Subject: [PATCH 003/116] Make perm alloc calls specific to GC implementation (#9) --- src/gc-common.c | 87 -------------------------------------- src/gc.c | 99 ++++++++++++++++++++++++++++++++++++++++++++ src/julia_internal.h | 8 ++++ src/mmtk-gc.c | 22 ++++++++++ src/staticdata.c | 3 ++ src/symbol.c | 4 ++ 6 files changed, 136 insertions(+), 87 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index f5636c97fe32a..8abee999ec48a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -521,94 +521,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, return gc_managed_realloc_(ptls, d, sz, oldsz, isaligned, owner, 1); } -// Perm gen allocator -// 2M pool -#define GC_PERM_POOL_SIZE (2 * 1024 * 1024) -// 20k limit for pool allocation. 
At most 1% fragmentation -#define GC_PERM_POOL_LIMIT (20 * 1024) uv_mutex_t gc_perm_lock; -static uintptr_t gc_perm_pool = 0; -static uintptr_t gc_perm_end = 0; - -static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT -{ - // `align` must be power of two - assert(offset == 0 || offset < align); - const size_t malloc_align = sizeof(void*) == 8 ? 16 : 4; - if (align > 1 && (offset != 0 || align > malloc_align)) - sz += align - 1; - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); -#endif - void *base = zero ? calloc(1, sz) : malloc(sz); - if (base == NULL) - jl_throw(jl_memory_exception); -#ifdef _OS_WINDOWS_ - SetLastError(last_error); -#endif - errno = last_errno; - jl_may_leak(base); - assert(align > 0); - unsigned diff = (offset - (uintptr_t)base) % align; - return (void*)((char*)base + diff); -} - -STATIC_INLINE void *gc_try_perm_alloc_pool(size_t sz, unsigned align, unsigned offset) JL_NOTSAFEPOINT -{ - uintptr_t pool = LLT_ALIGN(gc_perm_pool + offset, (uintptr_t)align) - offset; - uintptr_t end = pool + sz; - if (end > gc_perm_end) - return NULL; - gc_perm_pool = end; - return (void*)jl_assume(pool); -} - -// **NOT** a safepoint -void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) -{ - // The caller should have acquired `gc_perm_lock` - assert(align < GC_PERM_POOL_LIMIT); -#ifndef MEMDEBUG - if (__unlikely(sz > GC_PERM_POOL_LIMIT)) -#endif - return gc_perm_alloc_large(sz, zero, align, offset); - void *ptr = gc_try_perm_alloc_pool(sz, align, offset); - if (__likely(ptr)) - return ptr; - int last_errno = errno; -#ifdef _OS_WINDOWS_ - DWORD last_error = GetLastError(); - void *pool = VirtualAlloc(NULL, GC_PERM_POOL_SIZE, MEM_COMMIT, PAGE_READWRITE); - SetLastError(last_error); - errno = last_errno; - if (__unlikely(pool == NULL)) - return NULL; -#else - void *pool = mmap(0, GC_PERM_POOL_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - errno = last_errno; - if (__unlikely(pool == MAP_FAILED)) - return NULL; -#endif - gc_perm_pool = (uintptr_t)pool; - gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; - return gc_try_perm_alloc_pool(sz, align, offset); -} - -// **NOT** a safepoint -void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) -{ - assert(align < GC_PERM_POOL_LIMIT); -#ifndef MEMDEBUG - if (__unlikely(sz > GC_PERM_POOL_LIMIT)) -#endif - return gc_perm_alloc_large(sz, zero, align, offset); - uv_mutex_lock(&gc_perm_lock); - void *p = jl_gc_perm_alloc_nolock(sz, zero, align, offset); - uv_mutex_unlock(&gc_perm_lock); - return p; -} JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) { diff --git a/src/gc.c b/src/gc.c index e656fa331be38..69ec08b6cf9d4 100644 --- a/src/gc.c +++ b/src/gc.c @@ -114,6 +114,105 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } +// Perm gen allocator +// 2M pool +#define GC_PERM_POOL_SIZE (2 * 1024 * 1024) +// 20k limit for pool allocation. At most 1% fragmentation +#define GC_PERM_POOL_LIMIT (20 * 1024) + +static uintptr_t gc_perm_pool = 0; +static uintptr_t gc_perm_end = 0; + +static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned offset) JL_NOTSAFEPOINT +{ + // `align` must be power of two + assert(offset == 0 || offset < align); + const size_t malloc_align = sizeof(void*) == 8 ? 
16 : 4; + if (align > 1 && (offset != 0 || align > malloc_align)) + sz += align - 1; + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *base = zero ? calloc(1, sz) : malloc(sz); + if (base == NULL) + jl_throw(jl_memory_exception); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + jl_may_leak(base); + assert(align > 0); + unsigned diff = (offset - (uintptr_t)base) % align; + return (void*)((char*)base + diff); +} + +STATIC_INLINE void *gc_try_perm_alloc_pool(size_t sz, unsigned align, unsigned offset) JL_NOTSAFEPOINT +{ + uintptr_t pool = LLT_ALIGN(gc_perm_pool + offset, (uintptr_t)align) - offset; + uintptr_t end = pool + sz; + if (end > gc_perm_end) + return NULL; + gc_perm_pool = end; + return (void*)jl_assume(pool); +} + +// **NOT** a safepoint +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + // The caller should have acquired `gc_perm_lock` + assert(align < GC_PERM_POOL_LIMIT); +#ifndef MEMDEBUG + if (__unlikely(sz > GC_PERM_POOL_LIMIT)) +#endif + return gc_perm_alloc_large(sz, zero, align, offset); + void *ptr = gc_try_perm_alloc_pool(sz, align, offset); + if (__likely(ptr)) + return ptr; + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); + void *pool = VirtualAlloc(NULL, GC_PERM_POOL_SIZE, MEM_COMMIT, PAGE_READWRITE); + SetLastError(last_error); + errno = last_errno; + if (__unlikely(pool == NULL)) + return NULL; +#else + void *pool = mmap(0, GC_PERM_POOL_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + errno = last_errno; + if (__unlikely(pool == MAP_FAILED)) + return NULL; +#endif + gc_perm_pool = (uintptr_t)pool; + gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; + return gc_try_perm_alloc_pool(sz, align, offset); +} + +// **NOT** a safepoint +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + assert(align < GC_PERM_POOL_LIMIT); +#ifndef MEMDEBUG + if (__unlikely(sz > GC_PERM_POOL_LIMIT)) +#endif + return gc_perm_alloc_large(sz, zero, align, offset); + uv_mutex_lock(&gc_perm_lock); + void *p = jl_gc_perm_alloc_nolock(sz, zero, align, offset); + uv_mutex_unlock(&gc_perm_lock); + return p; +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // Do nothing +} + +void jl_gc_notify_image_alloc(char* img_data, size_t len) +{ + // Do nothing +} + // Protect all access to `finalizer_list_marked` and `to_finalize`. 
// For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the diff --git a/src/julia_internal.h b/src/julia_internal.h index b921c63444e86..6db9a7325baff 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -334,6 +334,7 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTK_GC JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); +extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator); #endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; @@ -344,6 +345,8 @@ void *jl_gc_perm_alloc(size_t sz, int zero, void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v); void gc_sweep_sysimg(void); +void jl_gc_notify_image_load(const char* img_data, size_t len); +void jl_gc_notify_image_alloc(char* img_data, size_t len); // pools are 16376 bytes large (GC_POOL_SZ - GC_PAGE_OFFSET) static const int jl_gc_sizeclasses[] = { @@ -534,8 +537,13 @@ STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT sizeof(void*) * 2 : 16)); jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, sizeof(void*) % align); + // Possibly we do not need this for MMTk. We could declare a post_alloc func and define it differently in two GCs. uintptr_t tag = (uintptr_t)ty; o->header = tag | GC_OLD_MARKED; +#ifdef MMTK_GC + jl_ptls_t ptls = jl_current_task->ptls; + post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(o), allocsz, 1); +#endif return jl_valueof(o); } jl_value_t *jl_permbox8(jl_datatype_t *t, int8_t x); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 00cd54c9df920..943570167e1ff 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -480,6 +480,28 @@ void objprofile_reset(void) { } +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + jl_ptls_t ptls = jl_current_task->ptls; + void* addr = alloc(ptls->mmtk_mutator_ptr, sz, align, offset, 1); + return addr; +} + +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + return jl_gc_perm_alloc_nolock(sz, zero, align, offset); +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // TODO: We should notify MMTk about the image (VM space) +} + +void jl_gc_notify_image_alloc(char* img_data, size_t len) +{ + // TODO: We should call MMTk to bulk set object metadata for the image region +} + #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index 804193ff90229..16b4791bb4200 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -489,6 +489,7 @@ static void jl_load_sysimg_so(void) jl_dlsym(jl_sysimg_handle, "jl_system_image_data", (void **)&sysimg_data, 1); size_t *plen; jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3235,6 +3236,7 @@ static jl_value_t *jl_restore_package_image_from_stream(ios_t *f, jl_image_t *im JL_SIGATOMIC_BEGIN(); size_t len = dataendpos - datastartpos; char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0); + jl_gc_notify_image_alloc(sysimg, len); ios_seek(f, datastartpos); if (ios_readall(f, sysimg, len) != len || jl_crc32c(0, sysimg, len) != (uint32_t)checksum) { restored = jl_get_exceptionf(jl_errorexception_type, "Error reading system image file."); @@ 
-3331,6 +3333,7 @@ JL_DLLEXPORT void jl_restore_system_image(const char *fname)
     ios_seek_end(&f);
     size_t len = ios_pos(&f);
     char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
+    jl_gc_notify_image_alloc(sysimg, len);
     ios_seek(&f, 0);
     if (ios_readall(&f, sysimg, len) != len)
         jl_errorf("Error reading system image file.");
diff --git a/src/symbol.c b/src/symbol.c
index 14606c82b9778..0f8b41787ad13 100644
--- a/src/symbol.c
+++ b/src/symbol.c
@@ -41,6 +41,10 @@ static jl_sym_t *mk_symbol(const char *str, size_t len) JL_NOTSAFEPOINT
     sym = (jl_sym_t*)jl_valueof(tag);
     // set to old marked so that we won't look at it in the GC or write barrier.
     tag->header = ((uintptr_t)jl_symbol_type) | GC_OLD_MARKED;
+#ifdef MMTK_GC
+    jl_ptls_t ptls = jl_current_task->ptls;
+    post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(tag), nb, 1);
+#endif
     jl_atomic_store_relaxed(&sym->left, NULL);
     jl_atomic_store_relaxed(&sym->right, NULL);
     sym->hash = hash_symbol(str, len);

From a760a7ee28150261669cb0b31a8284214b3635c7 Mon Sep 17 00:00:00 2001
From: Yi Lin
Date: Thu, 4 May 2023 17:34:21 +1200
Subject: [PATCH 004/116] Implement MMTk write barrier (#11)

* Implement MMTk write barrier

* Check which barrier to use in codegen
---
 src/jl_exported_funcs.inc      |  2 ++
 src/julia.h                    | 16 ++++++++
 src/julia_internal.h           |  5 +++
 src/llvm-final-gc-lowering.cpp | 49 ++++++++++++++++++++++++-
 src/llvm-late-gc-lowering.cpp  | 44 ++++++++++++++++++++++
 src/llvm-pass-helpers.cpp      | 67 ++++++++++++++++++++++++++++++++++
 src/llvm-pass-helpers.h        | 10 +++++
 src/mmtk-gc.c                  | 11 ++++++
 8 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
index c475184573faa..b51e55510e172 100644
--- a/src/jl_exported_funcs.inc
+++ b/src/jl_exported_funcs.inc
@@ -186,6 +186,8 @@
     XX(jl_gc_pool_alloc) \
     XX(jl_gc_queue_multiroot) \
     XX(jl_gc_queue_root) \
+    XX(jl_gc_wb1_noinline) \
+    XX(jl_gc_wb2_noinline) \
     XX(jl_gc_safepoint) \
     XX(jl_gc_schedule_foreign_sweepfunc) \
     XX(jl_gc_set_cb_notify_external_alloc) \
diff --git a/src/julia.h b/src/julia.h
index 8a8624360fc7a..710fd11cf9372 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -961,17 +961,23 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
 }
 #else // MMTK_GC

+// MMTk's write barrier method. This is the full write barrier, including both the fastpath and the slowpath.
+// TODO: We should inline the fastpath in the following functions, and only call the slowpath.
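+// Note that mmtk_gc_wb_full is only forward-declared here; its definition is
+// at the end of this header, since it needs jl_current_task->ptls to reach
+// the thread's MMTk mutator context.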
+STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT; STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { + mmtk_gc_wb_full(parent, ptr); } STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* { + mmtk_gc_wb_full(ptr, (void*)0); } STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { + mmtk_gc_wb_full(parent, (void*)0); } #endif // MMTK_GC @@ -2268,6 +2274,16 @@ typedef struct { } jl_cgparams_t; extern JL_DLLEXPORT int jl_default_debug_info_kind; +#ifdef MMTK_GC +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_post(ptls->mmtk_mutator_ptr, parent, ptr); +} +#endif + #ifdef __cplusplus } #endif diff --git a/src/julia_internal.h b/src/julia_internal.h index 6db9a7325baff..65e1966385039 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -335,6 +335,7 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator); +extern uint8_t mmtk_needs_write_barrier(void); #endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; @@ -616,12 +617,16 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT #else // MMTK_GC +// TODO: We should inline fastpath in the following functions, and only call slowpath. + STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* { + mmtk_gc_wb_full(bnd, val); } STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* { + mmtk_gc_wb_full(parent, (void*)0); } #endif // MMTK_GC diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 5b8eeb49f60ad..a41f69d74b1e5 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -48,6 +48,10 @@ struct FinalLowerGC: private JuliaPassContext { Function *queueRootFunc; Function *poolAllocFunc; Function *bigAllocFunc; +#ifdef MMTK_GC + Function *writeBarrier1Func; + Function *writeBarrier2Func; +#endif Instruction *pgcstack; // Lowers a `julia.new_gc_frame` intrinsic. @@ -70,6 +74,11 @@ struct FinalLowerGC: private JuliaPassContext { // Lowers a `julia.safepoint` intrinsic. 
Value *lowerSafepoint(CallInst *target, Function &F); + +#ifdef MMTK_GC + Value *lowerWriteBarrier1(CallInst *target, Function &F); + Value *lowerWriteBarrier2(CallInst *target, Function &F); +#endif }; Value *FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) @@ -204,6 +213,22 @@ Value *FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) return load; } +#ifdef MMTK_GC +Value *FinalLowerGC::lowerWriteBarrier1(CallInst *target, Function &F) +{ + assert(target->arg_size() == 1); + target->setCalledFunction(writeBarrier1Func); + return target; +} + +Value *FinalLowerGC::lowerWriteBarrier2(CallInst *target, Function &F) +{ + assert(target->arg_size() == 2); + target->setCalledFunction(writeBarrier2Func); + return target; +} +#endif + Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { ++GCAllocBytesCount; @@ -311,8 +336,13 @@ bool FinalLowerGC::doInitialization(Module &M) { queueRootFunc = getOrDeclare(jl_well_known::GCQueueRoot); poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); - +#ifdef MMTK_GC + writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1); + writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2); + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; +#else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; +#endif unsigned j = 0; for (unsigned i = 0; i < sizeof(functionList) / sizeof(void*); i++) { if (!functionList[i]) @@ -328,8 +358,13 @@ bool FinalLowerGC::doInitialization(Module &M) { bool FinalLowerGC::doFinalization(Module &M) { +#ifdef MMTK_GC + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; + queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = nullptr; +#else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; queueRootFunc = poolAllocFunc = bigAllocFunc = nullptr; +#endif auto used = M.getGlobalVariable("llvm.compiler.used"); if (!used) return false; @@ -399,6 +434,10 @@ bool FinalLowerGC::runOnFunction(Function &F) auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); auto safepointFunc = getOrNull(jl_intrinsics::safepoint); +#ifdef MMTK_GC + auto writeBarrier1Func = getOrNull(jl_intrinsics::writeBarrier1); + auto writeBarrier2Func = getOrNull(jl_intrinsics::writeBarrier2); +#endif // Lower all calls to supported intrinsics. 
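    // Each julia.* placeholder intrinsic is rewritten below into a call to
    // the concrete runtime symbol declared in doInitialization (pool/big
    // alloc, queue-root, and, under MMTK_GC, the write barriers).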
for (BasicBlock &BB : F) { @@ -432,6 +471,14 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == queueGCRootFunc) { replaceInstruction(CI, lowerQueueGCRoot(CI, F), it); } +#ifdef MMTK_GC + else if (callee == writeBarrier1Func) { + replaceInstruction(CI, lowerWriteBarrier1(CI, F), it); + } + else if (callee == writeBarrier2Func) { + replaceInstruction(CI, lowerWriteBarrier2(CI, F), it); + } +#endif else if (callee == safepointFunc) { lowerSafepoint(CI, F); it = CI->eraseFromParent(); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 6837dc505a503..11f807bdca33f 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2512,6 +2512,50 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { } IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); +#ifndef MMTK_GC + auto parBits = builder.CreateAnd(EmitLoadTag(builder, parent), 3); + auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, 3)); + auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); + builder.SetInsertPoint(mayTrigTerm); + Value *anyChldNotMarked = NULL; + for (unsigned i = 1; i < CI->arg_size(); i++) { + Value *child = CI->getArgOperand(i); + Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, child), 1); + Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0)); + anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; + } + assert(anyChldNotMarked); // handled by all_of test above + MDBuilder MDB(parent->getContext()); + SmallVector Weights{1, 9}; + auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, + MDB.createBranchWeights(Weights)); + builder.SetInsertPoint(trigTerm); + if (CI->getCalledOperand() == write_barrier_func) { + builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); + } + else { + assert(false); + } +#else + if (CI->getCalledOperand() == write_barrier_func) { + // if (CI->arg_size() == 2) { + // // parent, target + // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier2); + // builder.CreateCall(wb_func, { parent, CI->getArgOperand(1) }); // We need to be careful about arg1, which may not match the type for wb_func. 
We probably need a bitcast + // } else { + // // parent and many targets + // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); + // builder.CreateCall(wb_func, { parent }); + // } + auto barrier = mmtk_needs_write_barrier(); + if (barrier == 1) { + // We only care about parent + Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); + builder.CreateCall(wb_func, { parent }); + } + } +#endif + auto parBits = builder.CreateAnd(EmitLoadTag(builder, parent), 3); auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, 3)); auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index ea390f01010fd..ff65ec7de3aab 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -117,6 +117,10 @@ namespace jl_intrinsics { static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; static const char *QUEUE_GC_ROOT_NAME = "julia.queue_gc_root"; static const char *SAFEPOINT_NAME = "julia.safepoint"; +#ifdef MMTK_GC + static const char *WRITE_BARRIER_1_NAME = "julia.write_barrier1_noinline"; + static const char *WRITE_BARRIER_2_NAME = "julia.write_barrier2_noinline"; +#endif // Annotates a function with attributes suitable for GC allocation // functions. Specifically, the return value is marked noalias and nonnull. @@ -223,12 +227,45 @@ namespace jl_intrinsics { intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return intrinsic; }); + +#ifdef MMTK_GC + const IntrinsicDescription writeBarrier1( + WRITE_BARRIER_1_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_1_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); + const IntrinsicDescription writeBarrier2( + WRITE_BARRIER_2_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_2_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); +#endif } namespace jl_well_known { static const char *GC_BIG_ALLOC_NAME = XSTR(jl_gc_big_alloc); static const char *GC_POOL_ALLOC_NAME = XSTR(jl_gc_pool_alloc); static const char *GC_QUEUE_ROOT_NAME = XSTR(jl_gc_queue_root); +#ifdef MMTK_GC + static const char *GC_WB_1_NAME = XSTR(jl_gc_wb1_noinline); + static const char *GC_WB_2_NAME = XSTR(jl_gc_wb2_noinline); +#endif using jl_intrinsics::addGCAllocAttributes; @@ -276,4 +313,34 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + +#ifdef MMTK_GC + const WellKnownFunctionDescription GCWriteBarrier1( + GC_WB_1_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_1_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); + + const WellKnownFunctionDescription GCWriteBarrier2( + GC_WB_2_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_2_NAME); 
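+            // InaccessibleMemOrArgMemOnly: the barrier only touches its
+            // arguments and GC-private metadata, so LLVM may still reorder
+            // unrelated loads and stores around the call.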
+ func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); +#endif } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 2b2bd50cd0e4d..7f4d7646829f3 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -129,6 +129,11 @@ namespace jl_intrinsics { // `julia.safepoint`: an intrinsic that triggers a GC safepoint. extern const IntrinsicDescription safepoint; + +#ifdef MMTK_GC + extern const IntrinsicDescription writeBarrier1; + extern const IntrinsicDescription writeBarrier2; +#endif } // A namespace for well-known Julia runtime function descriptions. @@ -149,6 +154,11 @@ namespace jl_well_known { // `jl_gc_queue_root`: queues a GC root. extern const WellKnownFunctionDescription GCQueueRoot; + +#ifdef MMTK_GC + extern const WellKnownFunctionDescription GCWriteBarrier1; + extern const WellKnownFunctionDescription GCWriteBarrier2; +#endif } #endif diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 943570167e1ff..06a0a028303ab 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -480,6 +480,17 @@ void objprofile_reset(void) { } +// No inline write barrier -- only used for debugging +JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT +{ + jl_gc_wb_back(parent); +} + +JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + jl_gc_wb(parent, ptr); +} + void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; From 95bc54ab673fcdf11604324638f597f32158a22f Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Fri, 5 May 2023 22:45:31 +1200 Subject: [PATCH 005/116] Allow GC to implement array ptr copy (#10) --- src/array.c | 69 +------------------------------------------- src/gc.c | 65 +++++++++++++++++++++++++++++++++++++++++ src/julia.h | 8 +++++ src/julia_internal.h | 2 ++ src/mmtk-gc.c | 6 ++++ 5 files changed, 82 insertions(+), 68 deletions(-) diff --git a/src/array.c b/src/array.c index c6cefbebceb20..86b1056ef4d07 100644 --- a/src/array.c +++ b/src/array.c @@ -59,15 +59,6 @@ JL_DLLEXPORT char *jl_array_typetagdata(jl_array_t *a) JL_NOTSAFEPOINT return ((char*)jl_array_data(a)) + ((jl_array_ndims(a) == 1 ? (a->maxsize - a->offset) : jl_array_len(a)) * a->elsize) + a->offset; } -STATIC_INLINE jl_value_t *jl_array_owner(jl_array_t *a JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT -{ - if (a->flags.how == 3) { - a = (jl_array_t*)jl_array_data_owner(a); - assert(jl_is_string(a) || a->flags.how != 3); - } - return (jl_value_t*)a; -} - #if defined(_P64) && defined(UINT128MAX) typedef __uint128_t wideint_t; #else @@ -1198,69 +1189,11 @@ JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary) return new_ary; } -// Copy element by element until we hit a young object, at which point -// we can finish by using `memmove`. 
-static NOINLINE ssize_t jl_array_ptr_copy_forward(jl_value_t *owner, - void **src_p, void **dest_p, - ssize_t n) JL_NOTSAFEPOINT -{ - _Atomic(void*) *src_pa = (_Atomic(void*)*)src_p; - _Atomic(void*) *dest_pa = (_Atomic(void*)*)dest_p; - for (ssize_t i = 0; i < n; i++) { - void *val = jl_atomic_load_relaxed(src_pa + i); - jl_atomic_store_release(dest_pa + i, val); - // `val` is young or old-unmarked - if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) { - jl_gc_queue_root(owner); - return i; - } - } - return n; -} - -static NOINLINE ssize_t jl_array_ptr_copy_backward(jl_value_t *owner, - void **src_p, void **dest_p, - ssize_t n) JL_NOTSAFEPOINT -{ - _Atomic(void*) *src_pa = (_Atomic(void*)*)src_p; - _Atomic(void*) *dest_pa = (_Atomic(void*)*)dest_p; - for (ssize_t i = 0; i < n; i++) { - void *val = jl_atomic_load_relaxed(src_pa + n - i - 1); - jl_atomic_store_release(dest_pa + n - i - 1, val); - // `val` is young or old-unmarked - if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) { - jl_gc_queue_root(owner); - return i; - } - } - return n; -} - // Unsafe, assume inbounds and that dest and src have the same eltype JL_DLLEXPORT void jl_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT { - assert(dest->flags.ptrarray && src->flags.ptrarray); - jl_value_t *owner = jl_array_owner(dest); - // Destination is old and doesn't refer to any young object - if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) { - jl_value_t *src_owner = jl_array_owner(src); - // Source is young or being promoted or might refer to young objects - // (i.e. source is not an old object that doesn't have wb triggered) - if (jl_astaggedvalue(src_owner)->bits.gc != GC_OLD_MARKED) { - ssize_t done; - if (dest_p < src_p || dest_p > src_p + n) { - done = jl_array_ptr_copy_forward(owner, src_p, dest_p, n); - dest_p += done; - src_p += done; - } - else { - done = jl_array_ptr_copy_backward(owner, src_p, dest_p, n); - } - n -= done; - } - } - memmove_refs(dest_p, src_p, n); + jl_gc_array_ptr_copy(dest, dest_p, src, src_p, n); } JL_DLLEXPORT void jl_array_ptr_1d_push(jl_array_t *a, jl_value_t *item) diff --git a/src/gc.c b/src/gc.c index 69ec08b6cf9d4..0f6c13777e265 100644 --- a/src/gc.c +++ b/src/gc.c @@ -114,6 +114,71 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb); } +// Copy element by element until we hit a young object, at which point +// we can finish by using `memmove`. 
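+// (Once one young element has been seen, jl_gc_queue_root(owner) re-queues
+// the whole destination array, so the remaining slots can be block-copied
+// without per-element checks.)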
+static NOINLINE ssize_t jl_array_ptr_copy_forward(jl_value_t *owner, + void **src_p, void **dest_p, + ssize_t n) JL_NOTSAFEPOINT +{ + _Atomic(void*) *src_pa = (_Atomic(void*)*)src_p; + _Atomic(void*) *dest_pa = (_Atomic(void*)*)dest_p; + for (ssize_t i = 0; i < n; i++) { + void *val = jl_atomic_load_relaxed(src_pa + i); + jl_atomic_store_release(dest_pa + i, val); + // `val` is young or old-unmarked + if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) { + jl_gc_queue_root(owner); + return i; + } + } + return n; +} + +static NOINLINE ssize_t jl_array_ptr_copy_backward(jl_value_t *owner, + void **src_p, void **dest_p, + ssize_t n) JL_NOTSAFEPOINT +{ + _Atomic(void*) *src_pa = (_Atomic(void*)*)src_p; + _Atomic(void*) *dest_pa = (_Atomic(void*)*)dest_p; + for (ssize_t i = 0; i < n; i++) { + void *val = jl_atomic_load_relaxed(src_pa + n - i - 1); + jl_atomic_store_release(dest_pa + n - i - 1, val); + // `val` is young or old-unmarked + if (val && !(jl_astaggedvalue(val)->bits.gc & GC_MARKED)) { + jl_gc_queue_root(owner); + return i; + } + } + return n; +} + +// Unsafe, assume inbounds and that dest and src have the same eltype +JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, + jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT +{ + assert(dest->flags.ptrarray && src->flags.ptrarray); + jl_value_t *owner = jl_array_owner(dest); + // Destination is old and doesn't refer to any young object + if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) { + jl_value_t *src_owner = jl_array_owner(src); + // Source is young or being promoted or might refer to young objects + // (i.e. source is not an old object that doesn't have wb triggered) + if (jl_astaggedvalue(src_owner)->bits.gc != GC_OLD_MARKED) { + ssize_t done; + if (dest_p < src_p || dest_p > src_p + n) { + done = jl_array_ptr_copy_forward(owner, src_p, dest_p, n); + dest_p += done; + src_p += done; + } + else { + done = jl_array_ptr_copy_backward(owner, src_p, dest_p, n); + } + n -= done; + } + } + memmove_refs(dest_p, src_p, n); +} + // Perm gen allocator // 2M pool #define GC_PERM_POOL_SIZE (2 * 1024 * 1024) diff --git a/src/julia.h b/src/julia.h index 710fd11cf9372..2396b7a38a00d 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1345,6 +1345,14 @@ STATIC_INLINE int jl_is_array(void *v) JL_NOTSAFEPOINT return jl_is_array_type(t); } +STATIC_INLINE jl_value_t *jl_array_owner(jl_array_t *a JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT +{ + if (a->flags.how == 3) { + a = (jl_array_t*)jl_array_data_owner(a); + assert(jl_is_string(a) || a->flags.how != 3); + } + return (jl_value_t*)a; +} STATIC_INLINE int jl_is_opaque_closure_type(void *t) JL_NOTSAFEPOINT { diff --git a/src/julia_internal.h b/src/julia_internal.h index 65e1966385039..6d456b470a116 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -630,6 +630,8 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT } #endif // MMTK_GC +JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT; + void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT; void jl_print_gc_stats(JL_STREAM *s); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 06a0a028303ab..9dc21c2ad48db 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -480,6 +480,12 @@ void objprofile_reset(void) { } +JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) 
JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_memory_region_copy(ptls->mmtk_mutator_ptr, jl_array_owner(src), src_p, jl_array_owner(dest), dest_p, n); +} + // No inline write barrier -- only used for debugging JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT { From eb407eb68ea976df72c8e1cd6ec8607ff8d98fa5 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Mon, 8 May 2023 17:51:14 +1200 Subject: [PATCH 006/116] Use MMTk VM space (#12) --- src/mmtk-gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 9dc21c2ad48db..b354d287baa14 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -511,7 +511,7 @@ void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) void jl_gc_notify_image_load(const char* img_data, size_t len) { - // TODO: We should notify MMTk about the image (VM space) + mmtk_set_vm_space((void*)img_data, len); } void jl_gc_notify_image_alloc(char* img_data, size_t len) From 620cb793090956892210c4233dd8e35c4d19d873 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 9 May 2023 20:37:17 +1200 Subject: [PATCH 007/116] Remove duplicate code that is possibly introduced during merging (#13) --- src/llvm-late-gc-lowering.cpp | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 11f807bdca33f..8a0210c626935 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2556,29 +2556,6 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { } #endif - auto parBits = builder.CreateAnd(EmitLoadTag(builder, parent), 3); - auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, 3)); - auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); - builder.SetInsertPoint(mayTrigTerm); - Value *anyChldNotMarked = NULL; - for (unsigned i = 1; i < CI->arg_size(); i++) { - Value *child = CI->getArgOperand(i); - Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, child), 1); - Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0)); - anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked; - } - assert(anyChldNotMarked); // handled by all_of test above - MDBuilder MDB(parent->getContext()); - SmallVector Weights{1, 9}; - auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false, - MDB.createBranchWeights(Weights)); - builder.SetInsertPoint(trigTerm); - if (CI->getCalledOperand() == write_barrier_func) { - builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent); - } - else { - assert(false); - } CI->eraseFromParent(); } if (maxframeargs == 0 && Frame) { From e7e43f11687e7db62dc18e90f3ab9b24099539fd Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 16 May 2023 16:07:44 +1200 Subject: [PATCH 008/116] Implement write barrier fastpath for sticky immix (#8) This PR implements the write barrier fastpath for sticky immix in both the runtime write barrier and the codegen write barrier. There are also a few other changes: 1. pass collection type to MMTk's `handle_user_collection_request`, 2. call MMTk in `jl_gc_notify_image_alloc`. 
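For background on the fastpath shape used below: MMTk keeps one "log bit" of
side metadata per 8-byte granule, packed eight to a byte, so a single metadata
byte covers 64 bytes of heap; `addr >> 6` selects the byte and `(addr >> 3) & 7`
selects the bit. A minimal, self-contained sketch of that check follows
(`needs_slowpath` and `side_log_base` are illustrative names only; the real
base address is the binding-exported `MMTK_SIDE_LOG_BIT_BASE_ADDRESS`):

    #include <stdint.h>

    /* Sketch of the sticky-immix object-barrier fastpath: consult the
     * per-object log bit and call the out-of-line slowpath only when the
     * bit is set, mirroring the mmtk_gc_wb_fast added by this patch. */
    static inline int needs_slowpath(const void *parent, const uint8_t *side_log_base)
    {
        uintptr_t addr = (uintptr_t)parent;
        const uint8_t *meta = side_log_base + (addr >> 6); /* 1 byte per 64 heap bytes */
        unsigned bit = (unsigned)(addr >> 3) & 7;          /* 1 bit per 8-byte granule */
        return (*meta >> bit) & 1;                         /* 1 => take the slowpath */
    }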
--- src/jl_exported_funcs.inc | 2 ++ src/julia.h | 37 ++++++++++++++++++---- src/julia_internal.h | 7 ++-- src/llvm-final-gc-lowering.cpp | 35 ++++++++++++++++++-- src/llvm-late-gc-lowering.cpp | 57 +++++++++++++++++++++++++-------- src/llvm-pass-helpers.cpp | 58 ++++++++++++++++++++++++++++++++++ src/llvm-pass-helpers.h | 4 +++ src/mmtk-gc.c | 24 +++++++++++--- 8 files changed, 192 insertions(+), 32 deletions(-) diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index b51e55510e172..1f182f37f938f 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -188,6 +188,8 @@ XX(jl_gc_queue_root) \ XX(jl_gc_wb1_noinline) \ XX(jl_gc_wb2_noinline) \ + XX(jl_gc_wb1_slow) \ + XX(jl_gc_wb2_slow) \ XX(jl_gc_safepoint) \ XX(jl_gc_schedule_foreign_sweepfunc) \ XX(jl_gc_set_cb_notify_external_alloc) \ diff --git a/src/julia.h b/src/julia.h index 2396b7a38a00d..75ebab99dbbf7 100644 --- a/src/julia.h +++ b/src/julia.h @@ -961,23 +961,21 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_ } #else // MMTK_GC -// MMTk's write barrier method. This is the full write barier including fastpath and slowpath. -// TODO: We should inline fastpath in the following functions, and only call slowpath. -STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT; +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT; STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { - mmtk_gc_wb_full(parent, ptr); + mmtk_gc_wb(parent, ptr); } STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* { - mmtk_gc_wb_full(ptr, (void*)0); + mmtk_gc_wb(ptr, (void*)0); } STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { - mmtk_gc_wb_full(parent, (void*)0); + mmtk_gc_wb(parent, (void*)0); } #endif // MMTK_GC @@ -2284,12 +2282,39 @@ extern JL_DLLEXPORT int jl_default_debug_info_kind; #ifdef MMTK_GC extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern const uint8_t MMTK_NEEDS_WRITE_BARRIER; +extern const uint8_t OBJECT_BARRIER; +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; + +// Directly call into MMTk for write barrier (debugging only) STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; mmtk_object_reference_write_post(ptls->mmtk_mutator_ptr, parent, ptr); } + +// Inlined fastpath +STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + intptr_t addr = (intptr_t) (void*) parent; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + uint8_t byte_val = *meta_addr; + if (((byte_val >> shift) & 1) == 1) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); + } + } +} + +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + mmtk_gc_wb_fast(parent, ptr); +} #endif #ifdef __cplusplus diff --git a/src/julia_internal.h b/src/julia_internal.h index 6d456b470a116..fb939e81b4a69 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -335,7 +335,6 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t 
ptls, size_t allocsz); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator); -extern uint8_t mmtk_needs_write_barrier(void); #endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; @@ -617,16 +616,14 @@ STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOT #else // MMTK_GC -// TODO: We should inline fastpath in the following functions, and only call slowpath. - STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* { - mmtk_gc_wb_full(bnd, val); + mmtk_gc_wb(bnd, val); } STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* { - mmtk_gc_wb_full(parent, (void*)0); + mmtk_gc_wb(parent, (void*)0); } #endif // MMTK_GC diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a41f69d74b1e5..d60a8e181177b 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -51,6 +51,8 @@ struct FinalLowerGC: private JuliaPassContext { #ifdef MMTK_GC Function *writeBarrier1Func; Function *writeBarrier2Func; + Function *writeBarrier1SlowFunc; + Function *writeBarrier2SlowFunc; #endif Instruction *pgcstack; @@ -78,6 +80,8 @@ struct FinalLowerGC: private JuliaPassContext { #ifdef MMTK_GC Value *lowerWriteBarrier1(CallInst *target, Function &F); Value *lowerWriteBarrier2(CallInst *target, Function &F); + Value *lowerWriteBarrier1Slow(CallInst *target, Function &F); + Value *lowerWriteBarrier2Slow(CallInst *target, Function &F); #endif }; @@ -227,6 +231,21 @@ Value *FinalLowerGC::lowerWriteBarrier2(CallInst *target, Function &F) target->setCalledFunction(writeBarrier2Func); return target; } + +Value *FinalLowerGC::lowerWriteBarrier1Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 1); + target->setCalledFunction(writeBarrier1SlowFunc); + return target; +} + +Value *FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 2); + target->setCalledFunction(writeBarrier2SlowFunc); + return target; +} + #endif Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) @@ -339,7 +358,9 @@ bool FinalLowerGC::doInitialization(Module &M) { #ifdef MMTK_GC writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1); writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; + writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow); + writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow); + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc}; #else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; #endif @@ -359,8 +380,8 @@ bool FinalLowerGC::doInitialization(Module &M) { bool FinalLowerGC::doFinalization(Module &M) { #ifdef MMTK_GC - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func}; - queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = nullptr; + GlobalValue *functionList[] = {queueRootFunc, 
poolAllocFunc, bigAllocFunc, writeBarrier1Func, writeBarrier2Func, writeBarrier1SlowFunc, writeBarrier2SlowFunc}; + queueRootFunc = poolAllocFunc = bigAllocFunc = writeBarrier1Func = writeBarrier2Func = writeBarrier1SlowFunc = writeBarrier2SlowFunc = nullptr; #else GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; queueRootFunc = poolAllocFunc = bigAllocFunc = nullptr; @@ -437,6 +458,8 @@ bool FinalLowerGC::runOnFunction(Function &F) #ifdef MMTK_GC auto writeBarrier1Func = getOrNull(jl_intrinsics::writeBarrier1); auto writeBarrier2Func = getOrNull(jl_intrinsics::writeBarrier2); + auto writeBarrier1SlowFunc = getOrNull(jl_intrinsics::writeBarrier1Slow); + auto writeBarrier2SlowFunc = getOrNull(jl_intrinsics::writeBarrier2Slow); #endif // Lower all calls to supported intrinsics. @@ -478,6 +501,12 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == writeBarrier2Func) { replaceInstruction(CI, lowerWriteBarrier2(CI, F), it); } + else if (callee == writeBarrier1SlowFunc) { + replaceInstruction(CI, lowerWriteBarrier1Slow(CI, F), it); + } + else if (callee == writeBarrier2SlowFunc) { + replaceInstruction(CI, lowerWriteBarrier2Slow(CI, F), it); + } #endif else if (callee == safepointFunc) { lowerSafepoint(CI, F); diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 8a0210c626935..eec21c0c64010 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2537,22 +2537,51 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { assert(false); } #else + // FIXME: Currently we call write barrier with the src object (parent). + // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all. + // But for other MMTk plans, we need to be careful. + const bool INLINE_WRITE_BARRIER = true; if (CI->getCalledOperand() == write_barrier_func) { - // if (CI->arg_size() == 2) { - // // parent, target - // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier2); - // builder.CreateCall(wb_func, { parent, CI->getArgOperand(1) }); // We need to be careful about arg1, which may not match the type for wb_func. 
We probably need a bitcast - // } else { - // // parent and many targets - // Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); - // builder.CreateCall(wb_func, { parent }); - // } - auto barrier = mmtk_needs_write_barrier(); - if (barrier == 1) { - // We only care about parent - Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); - builder.CreateCall(wb_func, { parent }); + if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + if (INLINE_WRITE_BARRIER) { + auto i8_ty = Type::getInt8Ty(F.getContext()); + auto intptr_ty = T_size; + + // intptr_t addr = (intptr_t) (void*) src; + // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6)); + intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS); + auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address); + auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0)); + + auto parent_val = builder.CreatePtrToInt(parent, intptr_ty); + auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6)); + auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr); + + // intptr_t shift = (addr >> 3) & 0b111; + auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7)); + auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty); + + // uint8_t byte_val = *meta_addr; + auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align()); + + // if (((byte_val >> shift) & 1) == 1) { + auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8); + auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1)); + auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1)); + + // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target); + MDBuilder MDB(F.getContext()); + SmallVector<uint32_t, 2> Weights{1, 9}; + auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights)); + builder.SetInsertPoint(mayTriggerSlowpath); + builder.CreateCall(getOrDeclare(jl_intrinsics::writeBarrier1Slow), { parent }); + } else { + Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1); + builder.CreateCall(wb_func, { parent }); + } } + } else { + assert(false); } #endif diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index ff65ec7de3aab..1e1ae4bc7eada 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -120,6 +120,8 @@ namespace jl_intrinsics { #ifdef MMTK_GC static const char *WRITE_BARRIER_1_NAME = "julia.write_barrier1_noinline"; static const char *WRITE_BARRIER_2_NAME = "julia.write_barrier2_noinline"; + static const char *WRITE_BARRIER_1_SLOW_NAME = "julia.write_barrier_1_slow"; + static const char *WRITE_BARRIER_2_SLOW_NAME = "julia.write_barrier_2_slow"; #endif // Annotates a function with attributes suitable for GC allocation @@ -255,6 +257,32 @@ namespace jl_intrinsics { intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return intrinsic; }); + const IntrinsicDescription writeBarrier1Slow( + WRITE_BARRIER_1_SLOW_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_1_SLOW_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); + const IntrinsicDescription writeBarrier2Slow( + 
WRITE_BARRIER_2_SLOW_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + WRITE_BARRIER_2_SLOW_NAME); + intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return intrinsic; + }); #endif } @@ -265,6 +293,8 @@ namespace jl_well_known { #ifdef MMTK_GC static const char *GC_WB_1_NAME = XSTR(jl_gc_wb1_noinline); static const char *GC_WB_2_NAME = XSTR(jl_gc_wb2_noinline); + static const char *GC_WB_1_SLOW_NAME = XSTR(jl_gc_wb1_slow); + static const char *GC_WB_2_SLOW_NAME = XSTR(jl_gc_wb2_slow); #endif using jl_intrinsics::addGCAllocAttributes; @@ -342,5 +372,33 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + + const WellKnownFunctionDescription GCWriteBarrier1Slow( + GC_WB_1_SLOW_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_1_SLOW_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); + + const WellKnownFunctionDescription GCWriteBarrier2Slow( + GC_WB_2_SLOW_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_WB_2_SLOW_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); #endif } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 7f4d7646829f3..d6e4be7e05338 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -133,6 +133,8 @@ namespace jl_intrinsics { #ifdef MMTK_GC extern const IntrinsicDescription writeBarrier1; extern const IntrinsicDescription writeBarrier2; + extern const IntrinsicDescription writeBarrier1Slow; + extern const IntrinsicDescription writeBarrier2Slow; #endif } @@ -158,6 +160,8 @@ namespace jl_well_known { #ifdef MMTK_GC extern const WellKnownFunctionDescription GCWriteBarrier1; extern const WellKnownFunctionDescription GCWriteBarrier2; + extern const WellKnownFunctionDescription GCWriteBarrier1Slow; + extern const WellKnownFunctionDescription GCWriteBarrier2Slow; #endif } diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index b354d287baa14..a9feeb6ef4921 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -192,13 +192,13 @@ void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) { - /* TODO: not needed? */ + unreachable(); } // TODO: exported, but not MMTk-specific? JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { - /* TODO: confirm not needed? 
*/ + unreachable(); } @@ -207,11 +207,13 @@ JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { + unreachable(); return 0; } JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, jl_value_t **objs, size_t nobjs) { + unreachable(); } @@ -229,7 +231,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } - handle_user_collection_request(ptls); + handle_user_collection_request(ptls, collection); } // Per-thread initialization @@ -497,6 +499,20 @@ JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOT jl_gc_wb(parent, ptr); } +JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, (const void*) 0); +} + +JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); +} + void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; @@ -516,7 +532,7 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) void jl_gc_notify_image_alloc(char* img_data, size_t len) { - // TODO: We should call MMTk to bulk set object metadata for the image region + mmtk_immortal_region_post_alloc((void*)img_data, len); } #ifdef __cplusplus From ed8580ad3ae1d1ee46b84ec7bbe69ac9b37befca Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Wed, 17 May 2023 01:03:35 +0000 Subject: [PATCH 009/116] WIP --- src/gc-common.c | 6 +++--- src/gc.c | 25 +++++++++++++++++++++++++ src/gc.h | 2 +- src/mmtk-gc.c | 10 ++++++++-- src/partr.c | 25 ------------------------- 5 files changed, 37 insertions(+), 31 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 0c6138231e24f..8fd368f9e0875 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -46,7 +46,7 @@ memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; // finalizers // --- -static uint64_t finalizer_rngState[JL_RNG_SIZE]; +uint64_t finalizer_rngState[JL_RNG_SIZE]; void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT; @@ -259,7 +259,7 @@ static int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT { combine_thread_gc_counts(&gc_num); - live_bytes += (gc_num.deferred_alloc + gc_num.allocd); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); gc_num.allocd = 0; gc_num.deferred_alloc = 0; reset_thread_gc_counts(); @@ -501,7 +501,7 @@ void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, // TODO: not needed? gc_cache.*? 
if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; - live_bytes += allocsz - oldsz; + inc_live_bytes(allocsz - oldsz); } else if (allocsz < oldsz) jl_atomic_store_relaxed(&ptls->gc_num.freed, diff --git a/src/gc.c b/src/gc.c index 4a87980ae3924..d6d3955bdb68f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3677,6 +3677,31 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) return NULL; } +// gc thread function +void jl_gc_threadfun(void *arg) +{ + jl_threadarg_t *targ = (jl_threadarg_t*)arg; + + // initialize this thread (set tid and create heap) + jl_ptls_t ptls = jl_init_threadtls(targ->tid); + + // wait for all threads + jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); + uv_barrier_wait(targ->barrier); + + // free the thread argument here + free(targ); + + while (1) { + uv_mutex_lock(&gc_threads_lock); + while (jl_atomic_load(&gc_n_threads_marking) == 0) { + uv_cond_wait(&gc_threads_cond, &gc_threads_lock); + } + uv_mutex_unlock(&gc_threads_lock); + gc_mark_loop_parallel(ptls, 0); + } +} + // added for MMTk integration void enable_collection(void) { diff --git a/src/gc.h b/src/gc.h index a340a1ec0b545..3def80327ceda 100644 --- a/src/gc.h +++ b/src/gc.h @@ -42,7 +42,7 @@ extern void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value); extern jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz); extern jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize); -extern void jl_rng_split(uint64_t to[4], uint64_t from[4]); +extern void jl_rng_split(uint64_t to[JL_RNG_SIZE], uint64_t from[JL_RNG_SIZE]); extern void gc_premark(jl_ptls_t ptls2); extern void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner, int8_t can_collect); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index b354d287baa14..08d0bed7b4304 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -229,7 +229,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } - handle_user_collection_request(ptls); + handle_user_collection_request(ptls, collection); } // Per-thread initialization @@ -275,7 +275,7 @@ void jl_gc_init(void) if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); - JL_MUTEX_INIT(&heapsnapshot_lock); + JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock"); uv_mutex_init(&gc_perm_lock); gc_num.interval = default_collect_interval; @@ -480,6 +480,12 @@ void objprofile_reset(void) { } +// gc thread function +void jl_gc_threadfun(void *arg) +{ + unreachable(); +} + JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; diff --git a/src/partr.c b/src/partr.c index 403f911b1284f..2c729add629e2 100644 --- a/src/partr.c +++ b/src/partr.c @@ -108,31 +108,6 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *t); -// gc thread function -void jl_gc_threadfun(void *arg) -{ - jl_threadarg_t *targ = (jl_threadarg_t*)arg; - - // initialize this thread (set tid and create heap) - jl_ptls_t ptls = jl_init_threadtls(targ->tid); - - // wait for all threads - jl_gc_state_set(ptls, JL_GC_STATE_WAITING, 0); - uv_barrier_wait(targ->barrier); - - // free the thread argument here - free(targ); - - while (1) { - uv_mutex_lock(&gc_threads_lock); - while 
(jl_atomic_load(&gc_n_threads_marking) == 0) { - uv_cond_wait(&gc_threads_cond, &gc_threads_lock); - } - uv_mutex_unlock(&gc_threads_lock); - gc_mark_loop_parallel(ptls, 0); - } -} - // thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { From ec37ebe24b43973746e4730572c00365dd4edf5e Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Thu, 18 May 2023 04:45:30 +0000 Subject: [PATCH 010/116] Minor fix --- src/gc-common.c | 2 ++ src/gc.c | 2 -- src/llvm-late-gc-lowering.cpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 8fd368f9e0875..cfb83c08a7a6b 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -6,6 +6,8 @@ jl_gc_num_t gc_num = {0}; size_t last_long_collect_interval; int gc_n_threads; jl_ptls_t* gc_all_tls_states; +// `tid` of first GC thread +int gc_first_tid; int64_t live_bytes = 0; diff --git a/src/gc.c b/src/gc.c index d6d3955bdb68f..932bb1d97c6db 100644 --- a/src/gc.c +++ b/src/gc.c @@ -17,8 +17,6 @@ extern "C" { _Atomic(int) gc_n_threads_marking; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; -// `tid` of first GC thread -int gc_first_tid; // Mutex/cond used to synchronize sleep/wakeup of GC threads uv_mutex_t gc_threads_lock; uv_cond_t gc_threads_cond; diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index d812146027eba..4877565c61495 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2513,6 +2513,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { } IRBuilder<> builder(CI); builder.SetCurrentDebugLocation(CI->getDebugLoc()); +#ifndef MMTK_GC auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), 3); auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, 3)); auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false); From 34930e502fac00cac2698ecf14564ff764b03527 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Thu, 18 May 2023 05:58:48 +0000 Subject: [PATCH 011/116] Use JL_DLLIMPORT for MMTk functions. 
Update uses of MMTk functions with prefix mmtk_ --- src/init.c | 2 +- src/julia.h | 4 +-- src/julia_internal.h | 13 +++---- src/llvm-late-gc-lowering.cpp | 2 +- src/llvm-pass-helpers.cpp | 64 ++++++++++++++++++++++------------- src/mmtk-gc.c | 38 +++++++++++++-------- src/symbol.c | 2 +- 7 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/init.c b/src/init.c index a076b9d0fbed5..9c18a60eb8b06 100644 --- a/src/init.c +++ b/src/init.c @@ -825,7 +825,7 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) jl_ptls_t ptls = jl_init_threadtls(0); #ifdef MMTK_GC - initialize_collection((void *)ptls); + mmtk_initialize_collection((void *)ptls); #endif #pragma GCC diagnostic push #if defined(_COMPILER_GCC_) && __GNUC__ >= 12 diff --git a/src/julia.h b/src/julia.h index 5f692d1f4de2d..7950eca3e0f1d 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2383,7 +2383,7 @@ extern JL_DLLEXPORT int jl_default_debug_info_kind; extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); extern const uint8_t MMTK_NEEDS_WRITE_BARRIER; -extern const uint8_t OBJECT_BARRIER; +extern const uint8_t MMTK_OBJECT_BARRIER; extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; // Directly call into MMTk for write barrier (debugging only) @@ -2397,7 +2397,7 @@ STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSA // Inlined fastpath STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSAFEPOINT { - if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { intptr_t addr = (intptr_t) (void*) parent; uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); intptr_t shift = (addr >> 3) & 0b111; diff --git a/src/julia_internal.h b/src/julia_internal.h index d89de5753c380..5e5b0ebb76e41 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -333,15 +333,16 @@ extern jl_array_t *jl_all_methods JL_GLOBALLY_ROOTED; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; -void enable_collection(void); -void disable_collection(void); +extern void enable_collection(void); +extern void disable_collection(void); jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTK_GC -JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); -JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); -extern void post_alloc(void* mutator, void* obj, size_t bytes, int allocator); +JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); +JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); +JL_DLLIMPORT extern void mmtk_post_alloc(void* mutator, void* obj, size_t bytes, int allocator); +JL_DLLIMPORT extern void mmtk_initialize_collection(void* tls); #endif // MMTK_GC JL_DLLEXPORT int jl_gc_classify_pools(size_t sz, int *osize) JL_NOTSAFEPOINT; extern uv_mutex_t gc_perm_lock; @@ -549,7 +550,7 @@ STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT o->header = tag | GC_OLD_MARKED; #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(o), allocsz, 1); + mmtk_post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(o), allocsz, 1); #endif return 
jl_valueof(o); } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 4877565c61495..2bf340be13b62 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2543,7 +2543,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // But for other MMTk plans, we need to be careful. const bool INLINE_WRITE_BARRIER = true; if (CI->getCalledOperand() == write_barrier_func) { - if (MMTK_NEEDS_WRITE_BARRIER == OBJECT_BARRIER) { + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { if (INLINE_WRITE_BARRIER) { auto i8_ty = Type::getInt8Ty(F.getContext()); auto intptr_ty = T_size; diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 1aa7346516f62..df3ffa5e27486 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -243,11 +243,13 @@ namespace jl_intrinsics { #ifdef MMTK_GC const IntrinsicDescription writeBarrier1( WRITE_BARRIER_1_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto intrinsic = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue }, false), Function::ExternalLinkage, WRITE_BARRIER_1_NAME); @@ -256,11 +258,13 @@ namespace jl_intrinsics { }); const IntrinsicDescription writeBarrier2( WRITE_BARRIER_2_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto intrinsic = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue, context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue, T_prjlvalue }, false), Function::ExternalLinkage, WRITE_BARRIER_2_NAME); @@ -269,11 +273,13 @@ namespace jl_intrinsics { }); const IntrinsicDescription writeBarrier1Slow( WRITE_BARRIER_1_SLOW_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto intrinsic = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue }, false), Function::ExternalLinkage, WRITE_BARRIER_1_SLOW_NAME); @@ -282,11 +288,13 @@ namespace jl_intrinsics { }); const IntrinsicDescription writeBarrier2Slow( WRITE_BARRIER_2_SLOW_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto intrinsic = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue, context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue, T_prjlvalue }, false), Function::ExternalLinkage, WRITE_BARRIER_2_SLOW_NAME); @@ -379,11 +387,13 @@ namespace jl_well_known { #ifdef MMTK_GC const WellKnownFunctionDescription GCWriteBarrier1( GC_WB_1_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto func = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue }, false), Function::ExternalLinkage, GC_WB_1_NAME); @@ -393,11 +403,13 @@ namespace jl_well_known { const WellKnownFunctionDescription GCWriteBarrier2( GC_WB_2_NAME, - [](const JuliaPassContext 
&context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto func = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue, context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue, T_prjlvalue }, false), Function::ExternalLinkage, GC_WB_2_NAME); @@ -407,11 +419,13 @@ namespace jl_well_known { const WellKnownFunctionDescription GCWriteBarrier1Slow( GC_WB_1_SLOW_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto func = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue }, false), Function::ExternalLinkage, GC_WB_1_SLOW_NAME); @@ -421,11 +435,13 @@ namespace jl_well_known { const WellKnownFunctionDescription GCWriteBarrier2Slow( GC_WB_2_SLOW_NAME, - [](const JuliaPassContext &context) { + [](Type *T_size) { + auto &ctx = T_size->getContext(); + auto T_prjlvalue = JuliaType::get_prjlvalue_ty(ctx); auto func = Function::Create( FunctionType::get( - Type::getVoidTy(context.getLLVMContext()), - { context.T_prjlvalue, context.T_prjlvalue }, + Type::getVoidTy(ctx), + { T_prjlvalue, T_prjlvalue }, false), Function::ExternalLinkage, GC_WB_2_SLOW_NAME); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index b3646cb16dacf..6d232919a55f8 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -72,7 +72,7 @@ JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT { - register_finalizer(v, f, 1); + mmtk_register_finalizer(v, f, 1); } // schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) @@ -87,13 +87,13 @@ JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_funct jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); } else { - register_finalizer(v, f, 0); + mmtk_register_finalizer(v, f, 0); } } JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) { - run_finalizers_for_obj(o); + mmtk_run_finalizers_for_obj(o); } void jl_gc_run_all_finalizers(jl_task_t *ct) @@ -103,7 +103,7 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT { - register_finalizer(v, f, 0); + mmtk_register_finalizer(v, f, 0); } @@ -192,13 +192,13 @@ void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) { - unreachable(); + mmtk_unreachable(); } // TODO: exported, but not MMTk-specific? 
JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT { - unreachable(); + mmtk_unreachable(); } @@ -207,13 +207,13 @@ JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) { - unreachable(); + mmtk_unreachable(); return 0; } JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, jl_value_t **objs, size_t nobjs) { - unreachable(); + mmtk_unreachable(); } @@ -231,7 +231,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } - handle_user_collection_request(ptls, collection); + mmtk_handle_user_collection_request(ptls, collection); } // Per-thread initialization @@ -266,7 +266,7 @@ void jl_init_thread_heap(jl_ptls_t ptls) memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); - MMTk_Mutator mmtk_mutator = bind_mutator((void *)ptls, ptls->tid); + MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); ptls->mmtk_mutator_ptr = ((MMTkMutatorContext*)mmtk_mutator); } @@ -337,9 +337,9 @@ void jl_gc_init(void) // if only max size is specified initialize MMTk with a fixed size heap if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { - gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + mmtk_gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); } else { - gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + mmtk_gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); } } @@ -485,7 +485,17 @@ void objprofile_reset(void) // gc thread function void jl_gc_threadfun(void *arg) { - unreachable(); + mmtk_unreachable(); +} + +// added for MMTk integration +void enable_collection(void) +{ + mmtk_enable_collection(); +} +void disable_collection(void) +{ + mmtk_disable_collection(); } JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT @@ -522,7 +532,7 @@ JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFE void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; - void* addr = alloc(ptls->mmtk_mutator_ptr, sz, align, offset, 1); + void* addr = mmtk_alloc(ptls->mmtk_mutator_ptr, sz, align, offset, 1); return addr; } diff --git a/src/symbol.c b/src/symbol.c index 00de9872e8255..dcfa0b6086846 100644 --- a/src/symbol.c +++ b/src/symbol.c @@ -41,7 +41,7 @@ static jl_sym_t *mk_symbol(const char *str, size_t len) JL_NOTSAFEPOINT jl_set_typetagof(sym, jl_symbol_tag, GC_OLD_MARKED); #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(tag), nb, 1); + mmtk_post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(tag), nb, 1); #endif jl_atomic_store_relaxed(&sym->left, NULL); jl_atomic_store_relaxed(&sym->right, NULL); From 1af2dd00b700032ed5757a8e29b855732e989c81 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Thu, 18 May 2023 22:17:48 +0000 Subject: [PATCH 012/116] Pass n_gcthreads to mmtk_gc_init. Avoid spawning GC threads in Julia. 
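In short, the Rust binding now owns the GC worker threads: jl_gc_init forwards `jl_options.ngcthreads` to `mmtk_gc_init` once at startup, and Julia spawns no GC threads of its own. A compilable sketch of the handoff, with assumed parameter names and integer types read off the call sites in this diff (the binding's real prototype may differ):

    #include <stddef.h>
    #include <stdint.h>

    /* assumed shape of the binding's init entry point */
    extern void mmtk_gc_init(uintptr_t min_heap_size, uintptr_t max_heap_size,
                             uintptr_t n_gcthreads, void *upcalls, size_t header_size);

    /* called once from jl_gc_init(); MMTk spawns n_gcthreads workers internally */
    static void init_mmtk_sketch(uintptr_t min_heap, uintptr_t max_heap,
                                 uintptr_t ngcthreads, void *upcalls, size_t tag_size)
    {
        mmtk_gc_init(min_heap, max_heap, ngcthreads, upcalls, tag_size);
    }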
--- src/mmtk-gc.c | 4 ++-- src/threading.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 6d232919a55f8..10635cc11a07a 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -337,9 +337,9 @@ void jl_gc_init(void) // if only max size is specified initialize MMTk with a fixed size heap if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { - mmtk_gc_init(0, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + mmtk_gc_init(0, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); } else { - mmtk_gc_init(min_heap_size, max_heap_size, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); + mmtk_gc_init(min_heap_size, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); } } diff --git a/src/threading.c b/src/threading.c index 52d69805d0b79..4f24ce1aad704 100644 --- a/src/threading.c +++ b/src/threading.c @@ -663,6 +663,11 @@ void jl_init_threading(void) ngcthreads = (nthreads / 2) - 1; } } +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads. So we just set ngcthreads to 0 here + // to avoid spawning any GC threads in Julia. + ngcthreads = 0; +#endif jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); From da45bf76f3431ae0662e44d581a91bee7cae987d Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Fri, 19 May 2023 05:49:34 +0000 Subject: [PATCH 013/116] Set ngcthreads=0 in jl_start_threads --- src/mmtk-gc.c | 3 +++ src/threading.c | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 10635cc11a07a..8b4d1f2c22397 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -335,6 +335,9 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } + // jl_n_gcthreads must match jl_options.ngcthreads here so that either can be used below; assert it. + assert(jl_n_gcthreads == jl_options.ngcthreads); + // if only max size is specified initialize MMTk with a fixed size heap if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { mmtk_gc_init(0, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t))); diff --git a/src/threading.c b/src/threading.c index 4f24ce1aad704..51bdd6e8107da 100644 --- a/src/threading.c +++ b/src/threading.c @@ -663,6 +663,11 @@ void jl_init_threading(void) ngcthreads = (nthreads / 2) - 1; } } -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads. So we just set ngcthreads to 0 here - // to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); @@ -686,6 +681,11 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int ngcthreads = jl_n_gcthreads; +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawns its own GC threads. + // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. 
+ ngcthreads = 0; +#endif int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; From 8194356082ec76655dc4fb14a909e9b721730b79 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Thu, 25 May 2023 23:48:23 +0000 Subject: [PATCH 014/116] Fix stock Julia build --- src/gc-debug.c | 203 ++++++++++++++----------------------------------- src/gc.c | 138 ++++++++++++++++++++++++++++----- src/gc.h | 49 +++++------- src/mmtk-gc.c | 2 + 4 files changed, 201 insertions(+), 191 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index fc3da5b2ba282..df2e3487506fa 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -647,91 +647,6 @@ void jl_gc_debug_print_status(void) } #endif -#ifdef OBJPROFILE -static htable_t obj_counts[3]; -static htable_t obj_sizes[3]; -void objprofile_count(void *ty, int old, int sz) -{ - if (gc_verifying) return; - if ((intptr_t)ty <= 0x10) { - ty = (void*)jl_buff_tag; - } - else if (ty != (void*)jl_buff_tag && ty != jl_malloc_tag && - jl_typeof(ty) == (jl_value_t*)jl_datatype_type && - ((jl_datatype_t*)ty)->instance) { - ty = jl_singleton_tag; - } - void **bp = ptrhash_bp(&obj_counts[old], ty); - if (*bp == HT_NOTFOUND) - *bp = (void*)2; - else - (*((intptr_t*)bp))++; - bp = ptrhash_bp(&obj_sizes[old], ty); - if (*bp == HT_NOTFOUND) - *bp = (void*)(intptr_t)(1 + sz); - else - *((intptr_t*)bp) += sz; -} - -void objprofile_reset(void) -{ - for (int g = 0; g < 3; g++) { - htable_reset(&obj_counts[g], 0); - htable_reset(&obj_sizes[g], 0); - } -} - -static void objprofile_print(htable_t nums, htable_t sizes) -{ - for(int i=0; i < nums.size; i+=2) { - if (nums.table[i+1] != HT_NOTFOUND) { - void *ty = nums.table[i]; - int num = (intptr_t)nums.table[i + 1] - 1; - size_t sz = (uintptr_t)ptrhash_get(&sizes, ty) - 1; - static const int ptr_hex_width = 2 * sizeof(void*); - if (sz > 2e9) { - jl_safe_printf(" %6d : %*.1f GB of (%*p) ", - num, 6, ((double)sz) / 1024 / 1024 / 1024, - ptr_hex_width, ty); - } - else if (sz > 2e6) { - jl_safe_printf(" %6d : %*.1f MB of (%*p) ", - num, 6, ((double)sz) / 1024 / 1024, - ptr_hex_width, ty); - } - else if (sz > 2e3) { - jl_safe_printf(" %6d : %*.1f kB of (%*p) ", - num, 6, ((double)sz) / 1024, - ptr_hex_width, ty); - } - else { - jl_safe_printf(" %6d : %*d B of (%*p) ", - num, 6, (int)sz, ptr_hex_width, ty); - } - if (ty == (void*)jl_buff_tag) - jl_safe_printf("#<buff>"); - else if (ty == jl_malloc_tag) - jl_safe_printf("#<malloc>"); - else if (ty == jl_singleton_tag) - jl_safe_printf("#<singleton>"); - else - jl_static_show(JL_STDERR, (jl_value_t*)ty); - jl_safe_printf("\n"); - } - } -} - -void objprofile_printall(void) -{ - jl_safe_printf("Transient mark :\n"); - objprofile_print(obj_counts[0], obj_sizes[0]); - jl_safe_printf("Perm mark :\n"); - objprofile_print(obj_counts[1], obj_sizes[1]); - jl_safe_printf("Remset :\n"); - objprofile_print(obj_counts[2], obj_sizes[2]); -} -#endif - #if defined(GC_TIME) || defined(GC_FINAL_STATS) STATIC_INLINE double jl_ns2ms(int64_t t) { @@ -1257,68 +1172,68 @@ void gc_count_pool(void) // `offset` will be added to `mq->current` for convenience in the debugger. NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset) { - jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0) != 0) { - jl_safe_printf("\n!!! 
ERROR when unwinding gc mark loop -- ABORTING !!!\n"); - jl_set_safe_restore(old_buf); - return; - } - jl_value_t **start = mq->start; - jl_value_t **end = mq->current + offset; - for (; start < end; start++) { - jl_value_t *obj = *start; - jl_taggedvalue_t *o = jl_astaggedvalue(obj); - jl_safe_printf("Queued object: %p :: (tag: %zu) (bits: %zu)\n", obj, - (uintptr_t)o->header, ((uintptr_t)o->header & 3)); - jl_((void*)(jl_datatype_t *)(o->header & ~(uintptr_t)0xf)); - } - jl_set_safe_restore(old_buf); -} - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - else if (vt->name == jl_array_typename) { - jl_array_t *a = (jl_array_t*)obj; - start = (char*)a->data; - len = jl_array_len(a); - elsize = a->elsize; - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} + // jl_jmp_buf *old_buf = jl_get_safe_restore(); + // jl_jmp_buf buf; + // jl_set_safe_restore(&buf); + // if (jl_setjmp(buf, 0) != 0) { + // jl_safe_printf("\n!!! ERROR when unwinding gc mark loop -- ABORTING !!!\n"); + // jl_set_safe_restore(old_buf); + // return; + // } + // jl_value_t **start = mq->start; + // jl_value_t **end = mq->current + offset; + // for (; start < end; start++) { + // jl_value_t *obj = *start; + // jl_taggedvalue_t *o = jl_astaggedvalue(obj); + // jl_safe_printf("Queued object: %p :: (tag: %zu) (bits: %zu)\n", obj, + // (uintptr_t)o->header, ((uintptr_t)o->header & 3)); + // jl_((void*)(jl_datatype_t *)(o->header & ~(uintptr_t)0xf)); + // } + // jl_set_safe_restore(old_buf); +} + +// int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +// { +// int nf = (int)jl_datatype_nfields(vt); +// for (int i = 1; i < nf; i++) { +// if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) +// return i - 1; +// } +// return nf - 1; +// } + +// int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// { +// char *slot = (char*)_slot; +// jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); +// char *start = NULL; +// size_t len = 0; +// size_t elsize = sizeof(void*); +// if (vt == jl_module_type) { +// jl_module_t *m = (jl_module_t*)obj; +// start = (char*)m->usings.items; +// len = m->usings.len; +// } +// else if (vt == jl_simplevector_type) { +// start = (char*)jl_svec_data(obj); +// len = jl_svec_len(obj); +// } +// else if (vt->name == jl_array_typename) { +// jl_array_t *a = (jl_array_t*)obj; +// start = (char*)a->data; +// len = jl_array_len(a); +// elsize = a->elsize; +// } +// if (slot < start || slot >= start + elsize * len) +// return -1; +// return (slot - start) / elsize; +// } static int gc_logging_enabled = 0; -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} +// JL_DLLEXPORT void jl_enable_gc_logging(int enable) { +// gc_logging_enabled = enable; +// } void _report_gc_finished(uint64_t pause, uint64_t freed, int full, 
int recollect) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { diff --git a/src/gc.c b/src/gc.c index 932bb1d97c6db..ce80597a937f1 100644 --- a/src/gc.c +++ b/src/gc.c @@ -376,10 +376,6 @@ void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) } } -<<<<<<< HEAD - -======= ->>>>>>> upstream/master // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) @@ -2648,6 +2644,8 @@ JL_EXTENSION NOINLINE void gc_mark_loop_serial(jl_ptls_t ptls) gc_drain_own_chunkqueue(ptls, &ptls->mark_queue); } +extern int gc_first_tid; + void gc_mark_and_steal(jl_ptls_t ptls) { jl_gc_markqueue_t *mq = &ptls->mark_queue; @@ -2799,24 +2797,109 @@ void gc_mark_clean_reclaim_sets(void) } } -static void gc_premark(jl_ptls_t ptls2) +// void gc_premark(jl_ptls_t ptls2) +// { +// arraylist_t *remset = ptls2->heap.remset; +// ptls2->heap.remset = ptls2->heap.last_remset; +// ptls2->heap.last_remset = remset; +// ptls2->heap.remset->len = 0; +// ptls2->heap.remset_nptr = 0; +// // avoid counting remembered objects +// // in `perm_scanned_bytes` +// size_t len = remset->len; +// void **items = remset->items; +// for (size_t i = 0; i < len; i++) { +// jl_value_t *item = (jl_value_t *)items[i]; +// objprofile_count(jl_typeof(item), 2, 0); +// jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; +// } +// } + +#ifdef OBJPROFILE +static htable_t obj_counts[3]; +static htable_t obj_sizes[3]; +void objprofile_count(void *ty, int old, int sz) +{ + if (gc_verifying) return; + if ((intptr_t)ty <= 0x10) { + ty = (void*)jl_buff_tag; + } + else if (ty != (void*)jl_buff_tag && ty != jl_malloc_tag && + jl_typeof(ty) == (jl_value_t*)jl_datatype_type && + ((jl_datatype_t*)ty)->instance) { + ty = jl_singleton_tag; + } + void **bp = ptrhash_bp(&obj_counts[old], ty); + if (*bp == HT_NOTFOUND) + *bp = (void*)2; + else + (*((intptr_t*)bp))++; + bp = ptrhash_bp(&obj_sizes[old], ty); + if (*bp == HT_NOTFOUND) + *bp = (void*)(intptr_t)(1 + sz); + else + *((intptr_t*)bp) += sz; +} + +void objprofile_reset(void) { - arraylist_t *remset = ptls2->heap.remset; - ptls2->heap.remset = ptls2->heap.last_remset; - ptls2->heap.last_remset = remset; - ptls2->heap.remset->len = 0; - ptls2->heap.remset_nptr = 0; - // avoid counting remembered objects - // in `perm_scanned_bytes` - size_t len = remset->len; - void **items = remset->items; - for (size_t i = 0; i < len; i++) { - jl_value_t *item = (jl_value_t *)items[i]; - objprofile_count(jl_typeof(item), 2, 0); - jl_astaggedvalue(item)->bits.gc = GC_OLD_MARKED; + for (int g = 0; g < 3; g++) { + htable_reset(&obj_counts[g], 0); + htable_reset(&obj_sizes[g], 0); + } +} + +static void objprofile_print(htable_t nums, htable_t sizes) +{ + for(int i=0; i < nums.size; i+=2) { + if (nums.table[i+1] != HT_NOTFOUND) { + void *ty = nums.table[i]; + int num = (intptr_t)nums.table[i + 1] - 1; + size_t sz = (uintptr_t)ptrhash_get(&sizes, ty) - 1; + static const int ptr_hex_width = 2 * sizeof(void*); + if (sz > 2e9) { + jl_safe_printf(" %6d : %*.1f GB of (%*p) ", + num, 6, ((double)sz) / 1024 / 1024 / 1024, + ptr_hex_width, ty); + } + else if (sz > 2e6) { + jl_safe_printf(" %6d : %*.1f MB of (%*p) ", + num, 6, ((double)sz) / 1024 / 1024, + ptr_hex_width, ty); + } + else if (sz > 2e3) { + jl_safe_printf(" %6d : %*.1f kB of (%*p) ", + num, 6, ((double)sz) / 1024, + ptr_hex_width, ty); + } + else { + jl_safe_printf(" %6d : %*d B of (%*p) ", + num, 6, (int)sz, ptr_hex_width, ty); + } + if (ty == (void*)jl_buff_tag) + jl_safe_printf("#"); + else if (ty == jl_malloc_tag) + jl_safe_printf("#"); + else if (ty 
== jl_singleton_tag) + jl_safe_printf("#"); + else + jl_static_show(JL_STDERR, (jl_value_t*)ty); + jl_safe_printf("\n"); + } } } +void objprofile_printall(void) +{ + jl_safe_printf("Transient mark :\n"); + objprofile_print(obj_counts[0], obj_sizes[0]); + jl_safe_printf("Perm mark :\n"); + objprofile_print(obj_counts[1], obj_sizes[1]); + jl_safe_printf("Remset :\n"); + objprofile_print(obj_counts[2], obj_sizes[2]); +} +#endif + static void gc_queue_thread_local(jl_gc_markqueue_t *mq, jl_ptls_t ptls2) { jl_task_t *task; @@ -2955,6 +3038,9 @@ static void sweep_finalizer_list(arraylist_t *list) size_t jl_maxrss(void); +extern void objprofile_printall(void); +extern void objprofile_reset(void); + // Only one thread should be running in this function static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { @@ -3708,6 +3794,22 @@ void disable_collection(void) { } +JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT +{ +} + +JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ +} + +JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT +{ +} + +JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT +{ +} + #ifdef __cplusplus } #endif diff --git a/src/gc.h b/src/gc.h index 3def80327ceda..701c2c769e1b4 100644 --- a/src/gc.h +++ b/src/gc.h @@ -47,7 +47,24 @@ extern void gc_premark(jl_ptls_t ptls2); extern void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner, int8_t can_collect); extern size_t jl_array_nbytes(jl_array_t *a); -extern void objprofile_count(void *ty, int old, int sz); + +#ifdef OBJPROFILE +void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT; +void objprofile_printall(void); +void objprofile_reset(void); +#else +static inline void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT +{ +} + +static inline void objprofile_printall(void) +{ +} + +static inline void objprofile_reset(void) +{ +} +#endif #define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT) #define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT) @@ -70,7 +87,7 @@ extern uint64_t finalizer_rngState[]; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; -// keep in sync with the Julia type of the same name in base/timing.jl +// This struct must be kept in sync with the Julia type of the same name in base/timing.jl typedef struct { int64_t allocd; int64_t deferred_alloc; @@ -82,7 +99,6 @@ typedef struct { uint64_t freecall; uint64_t total_time; uint64_t total_allocd; - uint64_t since_sweep; size_t interval; int pause; int full_sweep; @@ -90,6 +106,7 @@ typedef struct { uint64_t max_memory; uint64_t time_to_safepoint; uint64_t max_time_to_safepoint; + uint64_t total_time_to_safepoint; uint64_t sweep_time; uint64_t mark_time; uint64_t total_sweep_time; @@ -217,32 +234,6 @@ typedef struct { jl_alloc_num_t print; } jl_gc_debug_env_t; -// This struct must be kept in sync with the Julia type of the same name in base/timing.jl -typedef struct { - int64_t allocd; - int64_t deferred_alloc; - int64_t freed; - uint64_t malloc; - uint64_t realloc; - uint64_t poolalloc; - uint64_t bigalloc; - uint64_t freecall; - uint64_t total_time; - uint64_t total_allocd; - size_t interval; - int pause; - int full_sweep; - uint64_t max_pause; - uint64_t max_memory; - uint64_t time_to_safepoint; - uint64_t max_time_to_safepoint; - uint64_t total_time_to_safepoint; - uint64_t 
sweep_time; - uint64_t mark_time; - uint64_t total_sweep_time; - uint64_t total_mark_time; -} jl_gc_num_t; - // Array chunks (work items representing suffixes of // large arrays of pointers left to be marked) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 8b4d1f2c22397..5e868ef11c1d2 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -473,6 +473,7 @@ void jl_print_gc_stats(JL_STREAM *s) { } +#ifdef OBJPROFILE void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT { } @@ -484,6 +485,7 @@ void objprofile_printall(void) void objprofile_reset(void) { } +#endif // gc thread function void jl_gc_threadfun(void *arg) From fb024c6f51849fde6ec10783a5eb595f34269e78 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 30 May 2023 16:00:51 +1200 Subject: [PATCH 015/116] Copy libmmtk_julia to usr/lib (#15) Copies libmmtk_julia.so from the source directory (`mmtk-julia/mmtk/target/debug/libmmtk_julia.so`) to `build/usr/lib`. --- Make.inc | 15 ++++++++++++++- src/Makefile | 4 +++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Make.inc b/Make.inc index 65b1468781632..6920fc64ecf70 100644 --- a/Make.inc +++ b/Make.inc @@ -750,7 +750,14 @@ endif MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk MMTK_API_INC = $(MMTK_DIR)/api MMTK_JULIA_INC = ${MMTK_JULIA_DIR}/julia -MMTK_LIB := -L$(MMTK_DIR)/target/$(MMTK_BUILD) -lmmtk_julia +ifeq ($(OS),Linux) +MMTK_LIB_NAME := libmmtk_julia.so +else +$(error "Unsupported OS for MMTk") +endif +MMTK_LIB_SRC := $(MMTK_DIR)/target/$(MMTK_BUILD)/$(MMTK_LIB_NAME) +MMTK_LIB_DST := $(BUILDROOT)/usr/lib/$(MMTK_LIB_NAME) +MMTK_LIB := -lmmtk_julia LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/ else MMTK_JULIA_INC := @@ -1692,6 +1699,9 @@ PRINT_PERL = printf ' %b %b\n' $(PERLCOLOR)PERL$(ENDCOLOR) $(BINCOLOR)$(GOAL) PRINT_FLISP = printf ' %b %b\n' $(FLISPCOLOR)FLISP$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_JULIA = printf ' %b %b\n' $(JULIACOLOR)JULIA$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_DTRACE = printf ' %b %b\n' $(DTRACECOLOR)DTRACE$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = printf ' %b %b\n' $(LINKCOLOR)MMTK$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +endif else QUIET_MAKE = @@ -1702,6 +1712,9 @@ PRINT_PERL = echo '$(subst ','\'',$(1))'; $(1) PRINT_FLISP = echo '$(subst ','\'',$(1))'; $(1) PRINT_JULIA = echo '$(subst ','\'',$(1))'; $(1) PRINT_DTRACE = echo '$(subst ','\'',$(1))'; $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = echo '$(subst ','\'',$(1))'; $(1) +endif endif diff --git a/src/Makefile b/src/Makefile index 66a3f3ac1c24b..ff5f4ce8b99d6 100644 --- a/src/Makefile +++ b/src/Makefile @@ -175,7 +175,7 @@ DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) ifeq ($(WITH_MMTK), 1) MMTK_SRCS := mmtk_julia -MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) +MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) else MMTK_OBJS := @@ -254,6 +254,8 @@ $(MMTK_JULIA_INC)/%.o: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@) $(MMTK_JULIA_INC)/%.dbg.obj: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@) +$(MMTK_LIB_DST): $(MMTK_LIB_SRC) + @$(call PRINT_MMTK, cp $< $@) endif # public header rules From 98a66ba3c0925ea21bfe051a191210eeae7df0f2 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Fri, 16 Jun 2023 10:10:34 +1200 Subject: [PATCH 016/116] Embed mutator in _jl_tls_states_t 
(#16) This PR embeds the MMTk mutator struct in `_jl_tls_states_t`, and also adds `jl_deinit_thread_heap` to allow a proper destruction of the mutator struct. --- src/gc.c | 5 +++++ src/julia.h | 4 ++-- src/julia_internal.h | 3 ++- src/julia_threads.h | 4 +--- src/llvm-final-gc-lowering.cpp | 12 +++++++++--- src/mmtk-gc.c | 19 ++++++++++++++----- src/symbol.c | 2 +- src/threading.c | 3 +++ 8 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/gc.c b/src/gc.c index ce80597a937f1..90eae32f0affc 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3501,6 +3501,11 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); } +void jl_deinit_thread_heap(jl_ptls_t ptls) +{ + // Do nothing +} + // System-wide initializations void jl_gc_init(void) { diff --git a/src/julia.h b/src/julia.h index 7950eca3e0f1d..253105ef94386 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2391,7 +2391,7 @@ STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSA { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_post(ptls->mmtk_mutator_ptr, parent, ptr); + mmtk_object_reference_write_post(&ptls->mmtk_mutator, parent, ptr); } // Inlined fastpath @@ -2405,7 +2405,7 @@ STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSA if (((byte_val >> shift) & 1) == 1) { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); + mmtk_object_reference_write_slow(&ptls->mmtk_mutator, parent, ptr); } } } diff --git a/src/julia_internal.h b/src/julia_internal.h index 5e5b0ebb76e41..76ed8f977dc7a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -550,7 +550,7 @@ STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT o->header = tag | GC_OLD_MARKED; #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - mmtk_post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(o), allocsz, 1); + mmtk_post_alloc(&ptls->mmtk_mutator, jl_valueof(o), allocsz, 1); #endif return jl_valueof(o); } @@ -918,6 +918,7 @@ void jl_init_serializer(void); void jl_gc_init(void); void jl_init_uv(void); void jl_init_thread_heap(jl_ptls_t ptls) JL_NOTSAFEPOINT; +void jl_deinit_thread_heap(jl_ptls_t ptls) JL_NOTSAFEPOINT; void jl_init_int32_int64_cache(void); JL_DLLEXPORT void jl_init_options(void); diff --git a/src/julia_threads.h b/src/julia_threads.h index 46ad724b71aa0..f79d17d35cb64 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -282,9 +282,7 @@ typedef struct _jl_tls_states_t { ) #ifdef MMTK_GC - MMTkMutatorContext* mmtk_mutator_ptr; - void* cursor; - void* limit; + MMTkMutatorContext mmtk_mutator; #endif // some hidden state (usually just because we don't have the type's size declaration) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 6ad46f1eb01d4..48eb584b81893 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -280,17 +280,23 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) #else // MMTK_GC auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); - auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, cursor)); - auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, limit)); + + // Assuming we use the first immix 
allocator. + // FIXME: We should get the allocator index and type from MMTk. + auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); - + // offset = 8 auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); auto result = builder.CreateNSWAdd(cursor, delta, "result"); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 5e868ef11c1d2..db3affd603cb2 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -266,8 +266,17 @@ void jl_init_thread_heap(jl_ptls_t ptls) memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + // Create mutator MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); - ptls->mmtk_mutator_ptr = ((MMTkMutatorContext*)mmtk_mutator); + // Copy the mutator to the thread local storage + memcpy(&ptls->mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); + // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) + mmtk_post_bind_mutator(&ptls->mmtk_mutator, mmtk_mutator); +} + +void jl_deinit_thread_heap(jl_ptls_t ptls) +{ + mmtk_destroy_mutator(&ptls->mmtk_mutator); } // System-wide initialization @@ -506,7 +515,7 @@ void disable_collection(void) JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - mmtk_memory_region_copy(ptls->mmtk_mutator_ptr, jl_array_owner(src), src_p, jl_array_owner(dest), dest_p, n); + mmtk_memory_region_copy(&ptls->mmtk_mutator, jl_array_owner(src), src_p, jl_array_owner(dest), dest_p, n); } // No inline write barrier -- only used for debugging @@ -524,20 +533,20 @@ JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, (const void*) 0); + mmtk_object_reference_write_slow(&ptls->mmtk_mutator, parent, (const void*) 0); } JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(ptls->mmtk_mutator_ptr, parent, ptr); + mmtk_object_reference_write_slow(&ptls->mmtk_mutator, parent, ptr); } void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; - void* addr = mmtk_alloc(ptls->mmtk_mutator_ptr, 
sz, align, offset, 1); + void* addr = mmtk_alloc(&ptls->mmtk_mutator, sz, align, offset, 1); return addr; } diff --git a/src/symbol.c b/src/symbol.c index dcfa0b6086846..f1cd18cfb84cc 100644 --- a/src/symbol.c +++ b/src/symbol.c @@ -41,7 +41,7 @@ static jl_sym_t *mk_symbol(const char *str, size_t len) JL_NOTSAFEPOINT jl_set_typetagof(sym, jl_symbol_tag, GC_OLD_MARKED); #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - mmtk_post_alloc(ptls->mmtk_mutator_ptr, jl_valueof(tag), nb, 1); + mmtk_post_alloc(&ptls->mmtk_mutator, jl_valueof(tag), nb, 1); #endif jl_atomic_store_relaxed(&sym->left, NULL); jl_atomic_store_relaxed(&sym->right, NULL); diff --git a/src/threading.c b/src/threading.c index 51bdd6e8107da..d58528fa183be 100644 --- a/src/threading.c +++ b/src/threading.c @@ -478,6 +478,9 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER #else pthread_mutex_unlock(&in_signal_lock); #endif + + jl_deinit_thread_heap(ptls); + // then park in safe-region (void)jl_gc_safe_enter(ptls); } From 0d8bbd943af2642f04dc6d8c2a74543c4ec8e84f Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Thu, 29 Jun 2023 19:36:39 +1200 Subject: [PATCH 017/116] Align up alloc size (#18) --- src/julia.h | 8 ++++++++ src/mmtk-gc.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/julia.h b/src/julia.h index 253105ef94386..44650a7d6ed0a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2414,6 +2414,14 @@ STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOI { mmtk_gc_wb_fast(parent, ptr); } + +#define MMTK_MIN_ALIGNMENT 4 +// MMTk assumes allocation size is aligned to min alignment. +STATIC_INLINE size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT +{ + return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); +} + #endif #ifdef __cplusplus diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index db3affd603cb2..84df79f432b6a 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -546,7 +546,8 @@ JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFE void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; - void* addr = mmtk_alloc(&ptls->mmtk_mutator, sz, align, offset, 1); + size_t allocsz = mmtk_align_alloc_sz(sz); + void* addr = mmtk_alloc(&ptls->mmtk_mutator, allocsz, align, offset, 1); return addr; } From bf1c43e84b4177513b082fb1727f6c360b5c33d4 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Fri, 30 Jun 2023 13:21:56 +1200 Subject: [PATCH 018/116] Allow skip inlined fastpath (#19) --- src/llvm-final-gc-lowering.cpp | 146 ++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 68 deletions(-) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 48eb584b81893..3f644a365a86c 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -281,74 +281,84 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); - // Assuming we use the first immix allocator. - // FIXME: We should get the allocator index and type from MMTk. 
- auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); - - auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); - auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); - - auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); - auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); - auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); - - // offset = 8 - auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); - auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); - auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); - // alignment 16 (15 = 16 - 1) - auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); - auto result = builder.CreateNSWAdd(cursor, delta, "result"); - - auto new_cursor = builder.CreateNSWAdd(result, pool_osize); - - auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); - auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); - auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); - - auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); - - auto current_block = target->getParent(); - builder.SetInsertPoint(target->getNextNode()); - auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); - auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); - - auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); - auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); - - auto next_br = current_block->getTerminator(); - next_br->eraseFromParent(); - builder.SetInsertPoint(current_block); - builder.CreateCondBr(gt_limit, slowpath, fastpath); - - // slowpath - builder.SetInsertPoint(slowpath); - auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32 }); - new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); - builder.CreateBr(top_cont); - - // // fastpath - builder.SetInsertPoint(fastpath); - builder.CreateStore(new_cursor, cursor_ptr); - - // ptls->gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); - auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - builder.CreateStore(pool_allocd_total, pool_alloc_tls); - - auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), 
sizeof(jl_taggedvalue_t))); - auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); - builder.CreateBr(top_cont); - - phiNode->addIncoming(new_call, slowpath); - phiNode->addIncoming(v_as_ptr, fastpath); - phiNode->takeName(target); - - return phiNode; + // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. + // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. + const bool INLINE_FASTPATH_ALLOCATION = true; + + if (INLINE_FASTPATH_ALLOCATION) { + // Assuming we use the first immix allocator. + // FIXME: We should get the allocator index and type from MMTk. + auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + // offset = 8 + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto current_block = target->getParent(); + builder.SetInsertPoint(target->getNextNode()); + auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow"); + auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont"); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont); + + auto next_br = current_block->getTerminator(); + next_br->eraseFromParent(); + builder.SetInsertPoint(current_block); + builder.CreateCondBr(gt_limit, slowpath, fastpath); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32 }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(top_cont); + + // // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_num.allocd += 
osize;
+        auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num));
+        auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+        auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+        auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+        auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+        builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+        auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+        auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());
+        builder.CreateBr(top_cont);
+
+        phiNode->addIncoming(new_call, slowpath);
+        phiNode->addIncoming(v_as_ptr, fastpath);
+        phiNode->takeName(target);
+
+        return phiNode;
+    } else {
+        auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+        newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32 });
+        derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize);
+    }
 #endif // MMTK_GC
     }
 } else {

From e5fc5ddebd884ee15124a1be8e4d599518433c1b Mon Sep 17 00:00:00 2001
From: Yi Lin
Date: Mon, 3 Jul 2023 11:50:56 +1200
Subject: [PATCH 019/116] Avoid calling mmtk_gc_poll frequently (#20)

---
 src/julia_threads.h |  1 +
 src/mmtk-gc.c       | 28 ++++++++++++++++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/julia_threads.h b/src/julia_threads.h
index f79d17d35cb64..3e9db5b676577 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -283,6 +283,7 @@ typedef struct _jl_tls_states_t {

 #ifdef MMTK_GC
     MMTkMutatorContext mmtk_mutator;
+    size_t malloc_sz_since_last_poll;
 #endif

 // some hidden state (usually just because we don't have the type's size declaration)
diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
index 84df79f432b6a..6f7e5f124e4b0 100644
--- a/src/mmtk-gc.c
+++ b/src/mmtk-gc.c
@@ -33,7 +33,24 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre

 inline void maybe_collect(jl_ptls_t ptls)
 {
-    mmtk_gc_poll(ptls);
+    // Just do a safepoint for the general maybe_collect
+    jl_gc_safepoint_(ptls);
+}
+
+// This is only used for malloc: we need to know whether we should trigger a GC. However,
+// checking with MMTk (mmtk_gc_poll) on every allocation is expensive, so we only poll
+// once every few allocations.
+static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz)
+{
+    // We do not need to maintain malloc_sz_since_last_poll precisely. We just need to
+    // avoid calling mmtk_gc_poll too frequently while keeping our heap usage accounting
+    // as accurate as we reasonably can.
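+    // (The 4096-byte threshold below is a heuristic: once it is exceeded we
+    // reset the counter and pay for a real mmtk_gc_poll; until then we only
+    // hit a plain safepoint.)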
+ if (ptls->malloc_sz_since_last_poll > 4096) { + jl_atomic_store_relaxed(&ptls->malloc_sz_since_last_poll, 0); + mmtk_gc_poll(ptls); + } else { + jl_atomic_fetch_add_relaxed(&ptls->malloc_sz_since_last_poll, sz); + jl_gc_safepoint_(ptls); + } } @@ -266,6 +283,9 @@ void jl_init_thread_heap(jl_ptls_t ptls) memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + // Clear the malloc sz count + jl_atomic_store_relaxed(&ptls->malloc_sz_since_last_poll, 0); + // Create mutator MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); // Copy the mutator to the thread local storage @@ -363,7 +383,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_task_t *ct = jl_current_task; if (pgcstack && ct->world_age) { jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); + malloc_maybe_collect(ptls, sz); jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, @@ -379,7 +399,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_task_t *ct = jl_current_task; if (pgcstack && ct->world_age) { jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); + malloc_maybe_collect(ptls, sz); jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, @@ -411,7 +431,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_task_t *ct = jl_current_task; if (pgcstack && ct->world_age) { jl_ptls_t ptls = ct->ptls; - maybe_collect(ptls); + malloc_maybe_collect(ptls, sz); if (sz < old) jl_atomic_store_relaxed(&ptls->gc_num.freed, jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); From 05c42ddc757a0aa2b308f4edb2aa01f9c905ca14 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Mon, 3 Jul 2023 05:09:40 +0000 Subject: [PATCH 020/116] Notify GC when loading pkg image --- src/staticdata.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/staticdata.c b/src/staticdata.c index 6b21b2f80437d..452e4380deb02 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -3540,6 +3540,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From 67c5c32e61c7e28ef44eabc57e3a6fa2154a45cc Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 25 Jul 2023 10:32:47 +1200 Subject: [PATCH 021/116] Remove MMTk counted malloc (#24) This PR changes the implementation of malloc methods for MMTk. We no longer use malloc and counted malloc methods from MMTk for those. Instead, we maintain a global counter for the size, and report the size to MMTk. 
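A minimal sketch of the counting scheme (the counter and the `jl_atomic_*`
helpers are the ones used in the diff below, from `julia_atomics.h`; the
wrappers shown here are illustrative, not the exact runtime code):

    #include <stdlib.h> /* malloc/free */

    /* Illustrative sketch: a process-wide tally of malloc'd bytes that is
     * reported to MMTk's heap accounting, instead of routing the calls
     * through MMTk's counted-malloc functions. */
    static _Atomic(int64_t) JULIA_MALLOC_BYTES;

    void *counted_malloc_sketch(size_t sz)
    {
        jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, (int64_t)sz);
        return malloc(sz);
    }

    void counted_free_sketch(void *p, size_t sz)
    {
        free(p);
        jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -(int64_t)sz);
    }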
--- src/gc.h | 4 ++- src/mmtk-gc.c | 86 +++++++++++++++++++++++++++------------------------ 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/src/gc.h b/src/gc.h index 701c2c769e1b4..6c689c4d5478e 100644 --- a/src/gc.h +++ b/src/gc.h @@ -33,10 +33,11 @@ extern void maybe_collect(jl_ptls_t ptls); extern void run_finalizer(jl_task_t *ct, void *o, void *ff); extern void *jl_malloc_aligned(size_t sz, size_t align); +extern void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align); +extern void jl_free_aligned(void *p); extern void *jl_gc_counted_calloc(size_t nm, size_t sz); extern void jl_gc_counted_free_with_size(void *p, size_t sz); extern void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -extern void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align); extern void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f); extern void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value); @@ -47,6 +48,7 @@ extern void gc_premark(jl_ptls_t ptls2); extern void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner, int8_t can_collect); extern size_t jl_array_nbytes(jl_array_t *a); +extern void run_finalizers(jl_task_t *ct); #ifdef OBJPROFILE void objprofile_count(void *ty, int old, int sz) JL_NOTSAFEPOINT; diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 6f7e5f124e4b0..fa9c4acd0aa9f 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -53,31 +53,56 @@ static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) } } - // malloc wrappers, aligned allocation -// --- +// We currently just duplicate what Julia GC does. We will in the future replace the malloc calls with MMTK's malloc. +#if defined(_OS_WINDOWS_) +inline void *jl_malloc_aligned(size_t sz, size_t align) +{ + return _aligned_malloc(sz ? sz : 1, align); +} +inline void *jl_realloc_aligned(void *p, size_t sz, size_t oldsz, + size_t align) +{ + (void)oldsz; + return _aligned_realloc(p, sz ? sz : 1, align); +} +inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT +{ + _aligned_free(p); +} +#else inline void *jl_malloc_aligned(size_t sz, size_t align) { - return mmtk_malloc_aligned(sz ? sz : 1, align); // XXX sz +#if defined(_P64) || defined(__APPLE__) + if (align <= 16) + return malloc(sz); +#endif + void *ptr; + if (posix_memalign(&ptr, align, sz)) + return NULL; + return ptr; } inline void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align) { - void *res = jl_malloc_aligned(sz, align); - if (res != NULL) { - memcpy(res, d, oldsz > sz ? sz : oldsz); - mmtk_free_aligned(d); +#if defined(_P64) || defined(__APPLE__) + if (align <= 16) + return realloc(d, sz); +#endif + void *b = jl_malloc_aligned(sz, align); + if (b != NULL) { + memcpy(b, d, oldsz > sz ? 
sz : oldsz); + free(d); } - return res; + return b; } inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT { - mmtk_free_aligned(p); + free(p); } +#endif - -// finalizers // --- JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) @@ -195,15 +220,14 @@ void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT if (a->flags.how == 2) { char *d = (char*)a->data - a->offset*a->elsize; if (a->flags.isaligned) - mmtk_free_aligned(d); + jl_free_aligned(d); else - mmtk_free(d); + free(d); gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } } - // roots // --- @@ -384,11 +408,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (pgcstack && ct->world_age) { jl_ptls_t ptls = ct->ptls; malloc_maybe_collect(ptls, sz); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - return mmtk_counted_malloc(sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); } return malloc(sz); } @@ -399,12 +419,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_task_t *ct = jl_current_task; if (pgcstack && ct->world_age) { jl_ptls_t ptls = ct->ptls; - malloc_maybe_collect(ptls, sz); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - return mmtk_counted_calloc(nm, sz); + malloc_maybe_collect(ptls, nm * sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); } return calloc(nm, sz); } @@ -413,16 +429,10 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); jl_task_t *ct = jl_current_task; + free(p); if (pgcstack && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.freecall, - jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); - mmtk_free_with_size(p, sz); - return; + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); } - free(p); } JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) @@ -433,16 +443,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; malloc_maybe_collect(ptls, sz); if (sz < old) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, old - sz); else - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - return mmtk_realloc_with_old_size(p, sz, old); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); } - // TODO: correct? return realloc(p, sz); } From f690aa3a5621bfa1d6a07f911818f203d3f8d650 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 25 Jul 2023 10:39:15 +1200 Subject: [PATCH 022/116] Use Julia's finalizer implementation (#22) This PR moves code about registering and running finalizers to `gc-common`. 
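For background, a sketch of the tag convention the moved lists rely on
(`gc_ptr_tag` and `gc_ptr_clear_tag` are the existing helpers; this is a
simplified rendering of `run_finalizer`, not the moved code itself):

    // Finalizer lists store (object, finalizer) pairs; low bits of the
    // object pointer encode how the finalizer must be invoked.
    void run_finalizer_sketch(void *o, void *ff)
    {
        int ptr_finalizer = gc_ptr_tag(o, 1); // bit 0: ff is a raw C function
        o = gc_ptr_clear_tag(o, 3);           // strip both tag bits
        if (ptr_finalizer) {
            ((void (*)(void*))ff)(o);         // call the C finalizer directly
        }
        else {
            // otherwise ff is a Julia function object; the real code invokes
            // it via jl_apply under an error handler
        }
    }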
--- src/gc-common.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++ src/gc.c | 258 +--------------------------------------------- src/gc.h | 1 + src/mmtk-gc.c | 54 ++-------- 4 files changed, 274 insertions(+), 303 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index cfb83c08a7a6b..3eacc2b2fd92d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -49,6 +49,14 @@ memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; // finalizers // --- uint64_t finalizer_rngState[JL_RNG_SIZE]; +jl_mutex_t finalizers_lock; +// `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers. +// If an object pointer has the lowest bit set, the next pointer is an unboxed c function pointer. +// If an object pointer has the second lowest bit set, the current pointer is a c object pointer. +// It must be aligned at least 4, and it finalized immediately (at "quiescence"). +// `to_finalize` should not have tagged pointers. +arraylist_t finalizer_list_marked; +arraylist_t to_finalize; void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT; @@ -57,6 +65,25 @@ JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) jl_rng_split(finalizer_rngState, jl_current_task->rngState); } +// The first two entries are assumed to be empty and the rest are assumed to +// be pointers to `jl_value_t` objects +STATIC_INLINE void jl_gc_push_arraylist(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT +{ + void **items = list->items; + items[0] = (void*)JL_GC_ENCODE_PUSHARGS(list->len - 2); + items[1] = ct->gcstack; + ct->gcstack = (jl_gcframe_t*)items; +} + +STATIC_INLINE void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT +{ + arraylist_push(&to_finalize, o); + arraylist_push(&to_finalize, f); + // doesn't need release, since we'll keep checking (on the reader) until we see the work and + // release our lock, and that will have a release barrier by then + jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1); +} + void run_finalizer(jl_task_t *ct, void *o, void *ff) { int ptr_finalizer = gc_ptr_tag(o, 1); @@ -79,6 +106,243 @@ void run_finalizer(jl_task_t *ct, void *o, void *ff) } } +void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + arraylist_t *a = &ptls->finalizers; + // This acquire load and the release store at the end are used to + // synchronize with `finalize_object` on another thread. Apart from the GC, + // which is blocked by entering a unsafe region, there might be only + // one other thread accessing our list in `finalize_object` + // (only one thread since it needs to acquire the finalizer lock). + // Similar to `finalize_object`, all content mutation has to be done + // between the acquire and the release of the length. + size_t oldlen = jl_atomic_load_acquire((_Atomic(size_t)*)&a->len); + if (__unlikely(oldlen + 2 > a->max)) { + JL_LOCK_NOGC(&finalizers_lock); + // `a->len` might have been modified. + // Another possibility is to always grow the array to `oldlen + 2` but + // it's simpler this way and uses slightly less memory =) + oldlen = a->len; + arraylist_grow(a, 2); + a->len = oldlen; + JL_UNLOCK_NOGC(&finalizers_lock); + } + void **items = a->items; + items[oldlen] = v; + items[oldlen + 1] = f; + jl_atomic_store_release((_Atomic(size_t)*)&a->len, oldlen + 2); +} + +// Same assumption as `jl_gc_push_arraylist`. Requires the finalizers lock +// to be hold for the current thread and will release the lock when the +// function returns. 
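+// Entries run in reverse insertion order, and the first two slots of `list`
+// are repurposed as GC frame metadata via jl_gc_push_arraylist above.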
+void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT_LEAVE +{ + // Avoid marking `ct` as non-migratable via an `@async` task (as noted in the docstring + // of `finalizer`) in a finalizer: + uint8_t sticky = ct->sticky; + // empty out the first two entries for the GC frame + arraylist_push(list, list->items[0]); + arraylist_push(list, list->items[1]); + jl_gc_push_arraylist(ct, list); + void **items = list->items; + size_t len = list->len; + JL_UNLOCK_NOGC(&finalizers_lock); + // run finalizers in reverse order they were added, so lower-level finalizers run last + for (size_t i = len-4; i >= 2; i -= 2) + run_finalizer(ct, items[i], items[i + 1]); + // first entries were moved last to make room for GC frame metadata + run_finalizer(ct, items[len-2], items[len-1]); + // matches the jl_gc_push_arraylist above + JL_GC_POP(); + ct->sticky = sticky; +} + +void run_finalizers(jl_task_t *ct) +{ + // Racy fast path: + // The race here should be OK since the race can only happen if + // another thread is writing to it with the lock held. In such case, + // we don't need to run pending finalizers since the writer thread + // will flush it. + if (to_finalize.len == 0) + return; + JL_LOCK_NOGC(&finalizers_lock); + if (to_finalize.len == 0) { + JL_UNLOCK_NOGC(&finalizers_lock); + return; + } + arraylist_t copied_list; + memcpy(&copied_list, &to_finalize, sizeof(copied_list)); + if (to_finalize.items == to_finalize._space) { + copied_list.items = copied_list._space; + } + jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); + arraylist_new(&to_finalize, 0); + + uint64_t save_rngState[JL_RNG_SIZE]; + memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); + jl_rng_split(ct->rngState, finalizer_rngState); + + // This releases the finalizers lock. + int8_t was_in_finalizer = ct->ptls->in_finalizer; + ct->ptls->in_finalizer = 1; + jl_gc_run_finalizers_in_list(ct, &copied_list); + ct->ptls->in_finalizer = was_in_finalizer; + arraylist_free(&copied_list); + + memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); +} + +// if `need_sync` is true, the `list` is the `finalizers` list of another +// thread and we need additional synchronizations +void finalize_object(arraylist_t *list, jl_value_t *o, + arraylist_t *copied_list, int need_sync) JL_NOTSAFEPOINT +{ + // The acquire load makes sure that the first `len` objects are valid. + // If `need_sync` is true, all mutations of the content should be limited + // to the first `oldlen` elements and no mutation is allowed after the + // new length is published with the `cmpxchg` at the end of the function. + // This way, the mutation should not conflict with the owning thread, + // which only writes to locations later than `len` + // and will not resize the buffer without acquiring the lock. + size_t len = need_sync ? jl_atomic_load_acquire((_Atomic(size_t)*)&list->len) : list->len; + size_t oldlen = len; + void **items = list->items; + size_t j = 0; + for (size_t i = 0; i < len; i += 2) { + void *v = items[i]; + int move = 0; + if (o == (jl_value_t*)gc_ptr_clear_tag(v, 1)) { + void *f = items[i + 1]; + move = 1; + arraylist_push(copied_list, v); + arraylist_push(copied_list, f); + } + if (move || __unlikely(!v)) { + // remove item + } + else { + if (j < i) { + items[j] = items[i]; + items[j+1] = items[i+1]; + } + j += 2; + } + } + len = j; + if (oldlen == len) + return; + if (need_sync) { + // The memset needs to be unconditional since the thread might have + // already read the length. 
+ // The `memset` (like any other content mutation) has to be done + // **before** the `cmpxchg` which publishes the length. + memset(&items[len], 0, (oldlen - len) * sizeof(void*)); + jl_atomic_cmpswap((_Atomic(size_t)*)&list->len, &oldlen, len); + } + else { + list->len = len; + } +} + +JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT +{ + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); +} + +// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) +JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT +{ + assert(!gc_ptr_tag(v, 3)); + jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); +} + +JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT +{ + if (__unlikely(jl_typetagis(f, jl_voidpointer_type))) { + jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); + } + else { + jl_gc_add_finalizer_(ptls, v, f); + } +} + +JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) +{ + if (ct == NULL) + ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { + run_finalizers(ct); + } +} + +JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) +{ + JL_LOCK_NOGC(&finalizers_lock); + // Copy the finalizers into a temporary list so that code in the finalizer + // won't change the list as we loop through them. + // This list is also used as the GC frame when we are running the finalizers + arraylist_t copied_list; + arraylist_new(&copied_list, 0); + // No need to check the to_finalize list since the user is apparently + // still holding a reference to the object + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) + finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); + } + finalize_object(&finalizer_list_marked, o, &copied_list, 0); + gc_n_threads = 0; + gc_all_tls_states = NULL; + if (copied_list.len > 0) { + // This releases the finalizers lock. 
+ jl_gc_run_finalizers_in_list(ct, &copied_list); + } + else { + JL_UNLOCK_NOGC(&finalizers_lock); + } + arraylist_free(&copied_list); +} + +void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT +{ + void **items = flist->items; + size_t len = flist->len; + for(size_t i = 0; i < len; i+=2) { + void *v = items[i]; + void *f = items[i + 1]; + if (__unlikely(!v)) + continue; + schedule_finalization(v, f); + } + flist->len = 0; +} + +void jl_gc_run_all_finalizers(jl_task_t *ct) +{ + if (!ct) return; + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + schedule_all_finalizers(&finalizer_list_marked); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) + schedule_all_finalizers(&ptls2->finalizers); + } + gc_n_threads = 0; + gc_all_tls_states = NULL; + run_finalizers(ct); +} + JL_DLLEXPORT int jl_gc_get_finalizers_inhibited(jl_ptls_t ptls) { if (ptls == NULL) diff --git a/src/gc.c b/src/gc.c index 90eae32f0affc..08741df919dfa 100644 --- a/src/gc.c +++ b/src/gc.c @@ -290,7 +290,6 @@ void jl_gc_notify_image_alloc(char* img_data, size_t len) // For accessing `ptls->finalizers`, the lock is needed if a thread // is going to realloc the buffer (of its own list) or accessing the // list of another thread -static jl_mutex_t finalizers_lock; static uv_mutex_t gc_cache_lock; // Flag that tells us whether we need to support conservative marking @@ -335,14 +334,6 @@ pagetable_t memory_map; bigval_t *big_objects_marked = NULL; // -- Finalization -- -// `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers. -// If an object pointer has the lowest bit set, the next pointer is an unboxed c function pointer. -// If an object pointer has the second lowest bit set, the current pointer is a c object pointer. -// It must be aligned at least 4, and it finalized immediately (at "quiescence"). -// `to_finalize` should not have tagged pointers. -arraylist_t finalizer_list_marked; -arraylist_t to_finalize; - NOINLINE uintptr_t gc_get_stack_ptr(void) { @@ -425,7 +416,7 @@ inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT } #endif -static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT +STATIC_INLINE void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT { arraylist_push(&to_finalize, o); arraylist_push(&to_finalize, f); @@ -434,253 +425,6 @@ static void schedule_finalization(void *o, void *f) JL_NOTSAFEPOINT jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 1); } -// if `need_sync` is true, the `list` is the `finalizers` list of another -// thread and we need additional synchronizations -static void finalize_object(arraylist_t *list, jl_value_t *o, - arraylist_t *copied_list, int need_sync) JL_NOTSAFEPOINT -{ - // The acquire load makes sure that the first `len` objects are valid. - // If `need_sync` is true, all mutations of the content should be limited - // to the first `oldlen` elements and no mutation is allowed after the - // new length is published with the `cmpxchg` at the end of the function. - // This way, the mutation should not conflict with the owning thread, - // which only writes to locations later than `len` - // and will not resize the buffer without acquiring the lock. - size_t len = need_sync ? 
jl_atomic_load_acquire((_Atomic(size_t)*)&list->len) : list->len; - size_t oldlen = len; - void **items = list->items; - size_t j = 0; - for (size_t i = 0; i < len; i += 2) { - void *v = items[i]; - int move = 0; - if (o == (jl_value_t*)gc_ptr_clear_tag(v, 1)) { - void *f = items[i + 1]; - move = 1; - arraylist_push(copied_list, v); - arraylist_push(copied_list, f); - } - if (move || __unlikely(!v)) { - // remove item - } - else { - if (j < i) { - items[j] = items[i]; - items[j+1] = items[i+1]; - } - j += 2; - } - } - len = j; - if (oldlen == len) - return; - if (need_sync) { - // The memset needs to be unconditional since the thread might have - // already read the length. - // The `memset` (like any other content mutation) has to be done - // **before** the `cmpxchg` which publishes the length. - memset(&items[len], 0, (oldlen - len) * sizeof(void*)); - jl_atomic_cmpswap((_Atomic(size_t)*)&list->len, &oldlen, len); - } - else { - list->len = len; - } -} - -// The first two entries are assumed to be empty and the rest are assumed to -// be pointers to `jl_value_t` objects -static void jl_gc_push_arraylist(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT -{ - void **items = list->items; - items[0] = (void*)JL_GC_ENCODE_PUSHARGS(list->len - 2); - items[1] = ct->gcstack; - ct->gcstack = (jl_gcframe_t*)items; -} - -// Same assumption as `jl_gc_push_arraylist`. Requires the finalizers lock -// to be hold for the current thread and will release the lock when the -// function returns. -static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NOTSAFEPOINT_LEAVE -{ - // Avoid marking `ct` as non-migratable via an `@async` task (as noted in the docstring - // of `finalizer`) in a finalizer: - uint8_t sticky = ct->sticky; - // empty out the first two entries for the GC frame - arraylist_push(list, list->items[0]); - arraylist_push(list, list->items[1]); - jl_gc_push_arraylist(ct, list); - void **items = list->items; - size_t len = list->len; - JL_UNLOCK_NOGC(&finalizers_lock); - // run finalizers in reverse order they were added, so lower-level finalizers run last - for (size_t i = len-4; i >= 2; i -= 2) - run_finalizer(ct, items[i], items[i + 1]); - // first entries were moved last to make room for GC frame metadata - run_finalizer(ct, items[len-2], items[len-1]); - // matches the jl_gc_push_arraylist above - JL_GC_POP(); - ct->sticky = sticky; -} - -static void run_finalizers(jl_task_t *ct) -{ - // Racy fast path: - // The race here should be OK since the race can only happen if - // another thread is writing to it with the lock held. In such case, - // we don't need to run pending finalizers since the writer thread - // will flush it. - if (to_finalize.len == 0) - return; - JL_LOCK_NOGC(&finalizers_lock); - if (to_finalize.len == 0) { - JL_UNLOCK_NOGC(&finalizers_lock); - return; - } - arraylist_t copied_list; - memcpy(&copied_list, &to_finalize, sizeof(copied_list)); - if (to_finalize.items == to_finalize._space) { - copied_list.items = copied_list._space; - } - jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); - arraylist_new(&to_finalize, 0); - - uint64_t save_rngState[JL_RNG_SIZE]; - memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); - jl_rng_split(ct->rngState, finalizer_rngState); - - // This releases the finalizers lock. 
- int8_t was_in_finalizer = ct->ptls->in_finalizer; - ct->ptls->in_finalizer = 1; - jl_gc_run_finalizers_in_list(ct, &copied_list); - ct->ptls->in_finalizer = was_in_finalizer; - arraylist_free(&copied_list); - - memcpy(&ct->rngState[0], &save_rngState[0], sizeof(save_rngState)); -} - -JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) -{ - if (ct == NULL) - ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - if (!ptls->in_finalizer && ptls->locks.len == 0 && ptls->finalizers_inhibited == 0) { - run_finalizers(ct); - } -} - -static void schedule_all_finalizers(arraylist_t *flist) JL_NOTSAFEPOINT -{ - void **items = flist->items; - size_t len = flist->len; - for(size_t i = 0; i < len; i+=2) { - void *v = items[i]; - void *f = items[i + 1]; - if (__unlikely(!v)) - continue; - schedule_finalization(v, f); - } - flist->len = 0; -} - -void jl_gc_run_all_finalizers(jl_task_t *ct) -{ - if (!ct) return; - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - schedule_all_finalizers(&finalizer_list_marked); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - schedule_all_finalizers(&ptls2->finalizers); - } - gc_n_threads = 0; - gc_all_tls_states = NULL; - run_finalizers(ct); -} - -void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT -{ - assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); - arraylist_t *a = &ptls->finalizers; - // This acquire load and the release store at the end are used to - // synchronize with `finalize_object` on another thread. Apart from the GC, - // which is blocked by entering a unsafe region, there might be only - // one other thread accessing our list in `finalize_object` - // (only one thread since it needs to acquire the finalizer lock). - // Similar to `finalize_object`, all content mutation has to be done - // between the acquire and the release of the length. - size_t oldlen = jl_atomic_load_acquire((_Atomic(size_t)*)&a->len); - if (__unlikely(oldlen + 2 > a->max)) { - JL_LOCK_NOGC(&finalizers_lock); - // `a->len` might have been modified. 
- // Another possibility is to always grow the array to `oldlen + 2` but - // it's simpler this way and uses slightly less memory =) - oldlen = a->len; - arraylist_grow(a, 2); - a->len = oldlen; - JL_UNLOCK_NOGC(&finalizers_lock); - } - void **items = a->items; - items[oldlen] = v; - items[oldlen + 1] = f; - jl_atomic_store_release((_Atomic(size_t)*)&a->len, oldlen + 2); -} - -JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT -{ - jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 1), f); -} - -// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) -JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT -{ - assert(!gc_ptr_tag(v, 3)); - jl_gc_add_finalizer_(ptls, (void*)(((uintptr_t)v) | 3), f); -} - -JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT -{ - if (__unlikely(jl_typetagis(f, jl_voidpointer_type))) { - jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); - } - else { - jl_gc_add_finalizer_(ptls, v, f); - } -} - -JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) -{ - JL_LOCK_NOGC(&finalizers_lock); - // Copy the finalizers into a temporary list so that code in the finalizer - // won't change the list as we loop through them. - // This list is also used as the GC frame when we are running the finalizers - arraylist_t copied_list; - arraylist_new(&copied_list, 0); - // No need to check the to_finalize list since the user is apparently - // still holding a reference to the object - int gc_n_threads; - jl_ptls_t* gc_all_tls_states; - gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); - gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) - finalize_object(&ptls2->finalizers, o, &copied_list, jl_atomic_load_relaxed(&ct->tid) != i); - } - finalize_object(&finalizer_list_marked, o, &copied_list, 0); - gc_n_threads = 0; - gc_all_tls_states = NULL; - if (copied_list.len > 0) { - // This releases the finalizers lock. 
- jl_gc_run_finalizers_in_list(ct, &copied_list); - } - else { - JL_UNLOCK_NOGC(&finalizers_lock); - } - arraylist_free(&copied_list); -} - // explicitly scheduled objects for the sweepfunc callback static void gc_sweep_foreign_objs_in_list(arraylist_t *objs) { diff --git a/src/gc.h b/src/gc.h index 6c689c4d5478e..9fa780c24c30f 100644 --- a/src/gc.h +++ b/src/gc.h @@ -382,6 +382,7 @@ extern pagetable_t memory_map; extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; +extern jl_mutex_t finalizers_lock; extern int64_t lazy_freed_pages; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index fa9c4acd0aa9f..86c6fd17eb571 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -103,52 +103,6 @@ inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT } #endif -// --- - -JL_DLLEXPORT void jl_gc_run_pending_finalizers(jl_task_t *ct) -{ - if (ct == NULL) - ct = jl_current_task; - mmtk_jl_run_pending_finalizers(ct->ptls); -} - -JL_DLLEXPORT void jl_gc_add_ptr_finalizer(jl_ptls_t ptls, jl_value_t *v, void *f) JL_NOTSAFEPOINT -{ - mmtk_register_finalizer(v, f, 1); -} - -// schedule f(v) to call at the next quiescent interval (aka after the next safepoint/region on all threads) -JL_DLLEXPORT void jl_gc_add_quiescent(jl_ptls_t ptls, void **v, void *f) JL_NOTSAFEPOINT -{ - /* TODO: unsupported? */ -} - -JL_DLLEXPORT void jl_gc_add_finalizer_th(jl_ptls_t ptls, jl_value_t *v, jl_function_t *f) JL_NOTSAFEPOINT -{ - if (__unlikely(jl_typeis(f, jl_voidpointer_type))) { - jl_gc_add_ptr_finalizer(ptls, v, jl_unbox_voidpointer(f)); - } - else { - mmtk_register_finalizer(v, f, 0); - } -} - -JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o) -{ - mmtk_run_finalizers_for_obj(o); -} - -void jl_gc_run_all_finalizers(jl_task_t *ct) -{ - mmtk_jl_gc_run_all_finalizers(); -} - -void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT -{ - mmtk_register_finalizer(v, f, 0); -} - - // weak references // --- JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) @@ -323,6 +277,10 @@ void jl_deinit_thread_heap(jl_ptls_t ptls) mmtk_destroy_mutator(&ptls->mmtk_mutator); } +extern jl_mutex_t finalizers_lock; +extern arraylist_t to_finalize; +extern arraylist_t finalizer_list_marked; + // System-wide initialization // TODO: remove locks? remove anything else? 
void jl_gc_init(void)
 {
     jl_gc_set_max_memory(jl_options.heap_size_hint);
     JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock");
+    JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
     uv_mutex_init(&gc_perm_lock);

+    arraylist_new(&to_finalize, 0);
+    arraylist_new(&finalizer_list_marked, 0);
+
     gc_num.interval = default_collect_interval;
     last_long_collect_interval = default_collect_interval;
     gc_num.allocd = 0;

From f41239c9d91dd7bc84e3735ceae0c3fbdeaac2a1 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Tue, 25 Jul 2023 23:29:11 +0000
Subject: [PATCH 023/116] Refactor the code for scanning, getting object size
 in Rust, and removing the scan_obj_c option

---
 src/mmtk-gc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
index 86c6fd17eb571..3e2dd17fc7447 100644
--- a/src/mmtk-gc.c
+++ b/src/mmtk-gc.c
@@ -355,9 +355,9 @@ void jl_gc_init(void)

     // if only max size is specified initialize MMTk with a fixed size heap
     if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) {
-        mmtk_gc_init(0, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)));
+        mmtk_gc_init(0, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag);
     } else {
-        mmtk_gc_init(min_heap_size, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)));
+        mmtk_gc_init(min_heap_size, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag);
     }
 }

From ed30d1c925865dd5c4ca2482701e20d6bc4700ec Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 26 Jul 2023 11:34:46 +0000
Subject: [PATCH 024/116] Adding check for COPY_STACKS flag and
 julia_copy_stacks feature

---
 src/mmtk-gc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
index 3e2dd17fc7447..10336d3f7d1db 100644
--- a/src/mmtk-gc.c
+++ b/src/mmtk-gc.c
@@ -353,6 +353,17 @@ void jl_gc_init(void)
     // If the two values are the same, we can use either. Otherwise, we need to be careful.
     assert(jl_n_gcthreads == jl_options.ngcthreads);

+    // Check that the julia_copy_stack Rust feature was enabled when COPY_STACKS is defined
+    int copy_stacks;
+
+#ifdef COPY_STACKS
+    copy_stacks = 1;
+#else
+    copy_stacks = 0;
+#endif
+
+    mmtk_julia_copy_stack_check(copy_stacks);
+
     // if only max size is specified initialize MMTk with a fixed size heap

From 73411572e118d7cfd0110da46663b34cb82eb520 Mon Sep 17 00:00:00 2001
From: Yi Lin
Date: Thu, 27 Jul 2023 12:14:07 +1200
Subject: [PATCH 025/116] Inline runtime alloc (#23)

This PR:
* makes MMTk's plan a compiler option when building Julia
* implements MMTk's allocation fastpath for runtime allocations.
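As a worked example of the fastpath's alignment step (the formula is the
one in `bump_alloc_fast` in the diff below; the concrete numbers are
illustrative):

    delta  = (-offset - cursor) & (align - 1);
    result = cursor + delta;
    // e.g. cursor = 0x1008, align = 16, offset = 0:
    // delta = (-0x1008) & 15 = 8, so result = 0x1010, the next
    // 16-byte boundary, and the cursor then advances by `size`.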
--- Make.inc | 8 ++++++ src/julia.h | 59 ++++++++++++++++++++++++++++++++++++++++++-- src/julia_internal.h | 2 +- src/mmtk-gc.c | 2 +- src/symbol.c | 2 +- 5 files changed, 68 insertions(+), 5 deletions(-) diff --git a/Make.inc b/Make.inc index 6920fc64ecf70..bef6d1747b7d7 100644 --- a/Make.inc +++ b/Make.inc @@ -747,6 +747,14 @@ else MMTK_BUILD = release endif endif +ifeq (${MMTK_PLAN},Immix) +JCXXFLAGS += -DMMTK_PLAN_IMMIX +JCFLAGS += -DMMTK_PLAN_IMMIX +endif +ifeq (${MMTK_PLAN},StickyImmix) +JCXXFLAGS += -DMMTK_PLAN_STICKYIMMIX +JCFLAGS += -DMMTK_PLAN_STICKYIMMIX +endif MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk MMTK_API_INC = $(MMTK_DIR)/api MMTK_JULIA_INC = ${MMTK_JULIA_DIR}/julia diff --git a/src/julia.h b/src/julia.h index 44650a7d6ed0a..77a95bf625b80 100644 --- a/src/julia.h +++ b/src/julia.h @@ -2382,10 +2382,24 @@ extern JL_DLLEXPORT int jl_default_debug_info_kind; #ifdef MMTK_GC extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); -extern const uint8_t MMTK_NEEDS_WRITE_BARRIER; -extern const uint8_t MMTK_OBJECT_BARRIER; +extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); + extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; +// These need to be constants. + +#define MMTK_OBJECT_BARRIER (1) +// Stickyimmix needs write barrier. Immix does not need write barrier. +#ifdef MMTK_PLAN_IMMIX +#define MMTK_NEEDS_WRITE_BARRIER (0) +#endif +#ifdef MMTK_PLAN_STICKYIMMIX +#define MMTK_NEEDS_WRITE_BARRIER (1) +#endif + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + // Directly call into MMTk for write barrier (debugging only) STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT { @@ -2422,6 +2436,47 @@ STATIC_INLINE size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); } +STATIC_INLINE void* bump_alloc_fast(MMTkMutatorContext* mutator, uintptr_t* cursor, uintptr_t limit, size_t size, size_t align, size_t offset, int allocator) { + intptr_t delta = (-offset - *cursor) & (align - 1); + uintptr_t result = *cursor + (uintptr_t)delta; + + if (__unlikely(result + size > limit)) { + return (void*) mmtk_alloc(mutator, size, align, offset, allocator); + } else{ + *cursor = result + size; + return (void*)result; + } +} + +STATIC_INLINE void* mmtk_immix_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + ImmixAllocator* allocator = &mutator->allocators.immix[MMTK_DEFAULT_IMMIX_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (intptr_t)allocator->limit, size, align, offset, 0); +} + +STATIC_INLINE void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + // We do not need post alloc for immix objects in immix/stickyimmix +} + +STATIC_INLINE void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + BumpAllocator* allocator = &mutator->allocators.bump_pointer[MMTK_IMMORTAL_BUMP_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); +} + +STATIC_INLINE void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) obj; + uint8_t* meta_addr = (uint8_t*) 
(MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + while(1) { + uint8_t old_val = *meta_addr; + uint8_t new_val = old_val | (1 << shift); + if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { + break; + } + } + } +} + #endif #ifdef __cplusplus diff --git a/src/julia_internal.h b/src/julia_internal.h index 76ed8f977dc7a..cbd0bf7750251 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -550,7 +550,7 @@ STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT o->header = tag | GC_OLD_MARKED; #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - mmtk_post_alloc(&ptls->mmtk_mutator, jl_valueof(o), allocsz, 1); + mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(o), allocsz); #endif return jl_valueof(o); } diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 86c6fd17eb571..f45aa14692ad4 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -533,7 +533,7 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs { jl_ptls_t ptls = jl_current_task->ptls; size_t allocsz = mmtk_align_alloc_sz(sz); - void* addr = mmtk_alloc(&ptls->mmtk_mutator, allocsz, align, offset, 1); + void* addr = mmtk_immortal_alloc_fast(&ptls->mmtk_mutator, allocsz, align, offset); return addr; } diff --git a/src/symbol.c b/src/symbol.c index f1cd18cfb84cc..b745adbfba80c 100644 --- a/src/symbol.c +++ b/src/symbol.c @@ -41,7 +41,7 @@ static jl_sym_t *mk_symbol(const char *str, size_t len) JL_NOTSAFEPOINT jl_set_typetagof(sym, jl_symbol_tag, GC_OLD_MARKED); #ifdef MMTK_GC jl_ptls_t ptls = jl_current_task->ptls; - mmtk_post_alloc(&ptls->mmtk_mutator, jl_valueof(tag), nb, 1); + mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(tag), nb); #endif jl_atomic_store_relaxed(&sym->left, NULL); jl_atomic_store_relaxed(&sym->right, NULL); From ae2fa58013b841600cc416b65056813054dce3d5 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 3 Aug 2023 09:44:13 +0000 Subject: [PATCH 026/116] Supporting moving immix (wip) --- src/builtins.c | 12 +++++++++++- src/datatype.c | 2 ++ src/interpreter.c | 2 ++ src/ircode.c | 5 ++++- src/julia.h | 13 +++++++++++++ src/julia_internal.h | 4 +++- src/runtime_ccall.cpp | 2 ++ src/staticdata.c | 9 +++++++++ src/staticdata_utils.c | 9 +++++++++ src/toplevel.c | 2 ++ 10 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/builtins.c b/src/builtins.c index a6c904c851c95..f3fa4248b3fa4 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -344,6 +344,9 @@ static uintptr_t type_object_id_(jl_value_t *v, jl_varidx_t *env) JL_NOTSAFEPOIN i++; pe = pe->prev; } + if(mmtk_object_is_managed_by_mmtk(v)) { + mmtk_pin_object(v); + } return inthash((uintptr_t)v); } if (tv == jl_uniontype_type) { @@ -392,6 +395,9 @@ static uintptr_t immut_id_(jl_datatype_t *dt, jl_value_t *v, uintptr_t h) JL_NOT return ~h; size_t f, nf = jl_datatype_nfields(dt); if (nf == 0 || (!dt->layout->haspadding && dt->layout->npointers == 0)) { + if(mmtk_object_is_managed_by_mmtk(v)) { + mmtk_pin_object(v); + } // operate element-wise if there are unused bits inside, // otherwise just take the whole data block at once // a few select pointers (notably symbol) also have special hash values @@ -452,8 +458,12 @@ static uintptr_t NOINLINE jl_object_id__cold(jl_datatype_t *dt, jl_value_t *v) J jl_module_t *m = (jl_module_t*)v; return m->hash; } - if (dt->name->mutabl) + if (dt->name->mutabl) { + if(mmtk_object_is_managed_by_mmtk(v)) { + mmtk_pin_object(v); + } return inthash((uintptr_t)v); + } return 
immut_id_(dt, v, dt->hash); } diff --git a/src/datatype.c b/src/datatype.c index 95c3b11c9abdc..20c3af1555675 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -65,6 +65,7 @@ JL_DLLEXPORT jl_typename_t *jl_new_typename_in(jl_sym_t *name, jl_module_t *modu jl_typename_t *tn = (jl_typename_t*)jl_gc_alloc(ct->ptls, sizeof(jl_typename_t), jl_typename_type); + mmtk_pin_object(tn); tn->name = name; tn->module = module; tn->wrapper = NULL; @@ -96,6 +97,7 @@ jl_datatype_t *jl_new_uninitialized_datatype(void) { jl_task_t *ct = jl_current_task; jl_datatype_t *t = (jl_datatype_t*)jl_gc_alloc(ct->ptls, sizeof(jl_datatype_t), jl_datatype_type); + mmtk_pin_object(t); jl_set_typetagof(t, jl_datatype_tag, 0); t->hash = 0; t->hasfreetypevars = 0; diff --git a/src/interpreter.c b/src/interpreter.c index c08496f72ce04..cdc2a5a96beef 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -748,6 +748,7 @@ jl_value_t *NOINLINE jl_interpret_toplevel_thunk(jl_module_t *m, jl_code_info_t unsigned nroots = jl_source_nslots(src) + jl_source_nssavalues(src); JL_GC_PUSHFRAME(s, s->locals, nroots); jl_array_t *stmts = src->code; + JL_GC_PUSH1(&stmts); assert(jl_typetagis(stmts, jl_array_any_type)); s->src = src; s->module = m; @@ -760,6 +761,7 @@ jl_value_t *NOINLINE jl_interpret_toplevel_thunk(jl_module_t *m, jl_code_info_t jl_value_t *r = eval_body(stmts, s, 0, 1); ct->world_age = last_age; JL_GC_POP(); + JL_GC_POP(); return r; } diff --git a/src/ircode.c b/src/ircode.c index 4121d6691aa5b..69a5ed9a05a80 100644 --- a/src/ircode.c +++ b/src/ircode.c @@ -1162,12 +1162,15 @@ void jl_init_serializer(void) assert(LAST_TAG+1+i < 256); for (i = 2; i < 256; i++) { - if (deser_tag[i]) + if (deser_tag[i]) { + PTRHASH_PIN(deser_tag[i]) ptrhash_put(&ser_tag, deser_tag[i], (void*)i); + } } i = 2; while (common_symbols[i-2] != NULL) { + PTRHASH_PIN(common_symbols[i-2]) ptrhash_put(&common_symbol_tag, common_symbols[i-2], (void*)i); deser_symbols[i] = (jl_value_t*)common_symbols[i-2]; i += 1; diff --git a/src/julia.h b/src/julia.h index 77a95bf625b80..9d3e177544af4 100644 --- a/src/julia.h +++ b/src/julia.h @@ -3,6 +3,19 @@ #ifndef JULIA_H #define JULIA_H +#ifdef __cplusplus +extern "C" { +#endif + +extern int mmtk_object_is_managed_by_mmtk(void* addr); +extern unsigned char mmtk_pin_object(void* obj); +#define PTRHASH_PIN(key) \ + mmtk_pin_object(key); \ + +#ifdef __cplusplus +} +#endif + #if defined(JL_LIBRARY_EXPORTS_INTERNAL) || defined(JL_LIBRARY_EXPORTS_CODEGEN) #define JL_LIBRARY_EXPORTS #endif diff --git a/src/julia_internal.h b/src/julia_internal.h index cbd0bf7750251..c3588b6518816 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -535,7 +535,9 @@ JL_DLLEXPORT uintptr_t jl_get_buff_tag(void); typedef void jl_gc_tracked_buffer_t; // For the benefit of the static analyzer STATIC_INLINE jl_gc_tracked_buffer_t *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) { - return jl_gc_alloc(ptls, sz, (void*)jl_buff_tag); + jl_gc_tracked_buffer_t *buf = jl_gc_alloc(ptls, sz, (void*)jl_buff_tag); + mmtk_pin_object(buf); + return buf; } STATIC_INLINE jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp index 23793254c205d..cac9270c72556 100644 --- a/src/runtime_ccall.cpp +++ b/src/runtime_ccall.cpp @@ -352,6 +352,8 @@ jl_value_t *jl_get_cfunction_trampoline( tramp = trampoline_alloc(); ((void**)result)[0] = tramp; tramp = init_trampoline(tramp, nval); + PTRHASH_PIN((void*)fobj) + PTRHASH_PIN(result) ptrhash_put(cache, (void*)fobj, result); 
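A note on the side log bit arithmetic in the julia.h hunk above (patch 025): MMTk keeps one "unlogged" bit per 8 bytes of address space, so each side-metadata byte covers 64 bytes of heap; that is where the `addr >> 6` (byte index) and `(addr >> 3) & 0b111` (bit index) come from. A standalone sketch with concrete numbers (illustrative only; `log_bit_base` stands in for MMTK_SIDE_LOG_BIT_BASE_ADDRESS):

    #include <stdint.h>

    // One log bit per 8 bytes of heap: metadata byte = addr / 64,
    // bit within that byte = (addr / 8) % 8.
    static inline int log_bit_is_set(const uint8_t *log_bit_base, uintptr_t addr)
    {
        const uint8_t *meta_addr = log_bit_base + (addr >> 6);
        unsigned shift = (addr >> 3) & 0x7;
        return (*meta_addr >> shift) & 1;
    }
    // Example: addr = 0x10040 -> metadata byte 0x401, bit 0. The CAS loop in
    // mmtk_immortal_post_alloc_fast above sets exactly this bit for a freshly
    // allocated immortal object; the write barrier consults it to decide
    // whether the slow path has to record the write.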
uv_mutex_unlock(&trampoline_lock); return result; diff --git a/src/staticdata.c b/src/staticdata.c index 49b97480b5165..fba106ad632d3 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -655,6 +655,8 @@ static int needs_uniquing(jl_value_t *v) JL_NOTSAFEPOINT static void record_field_change(jl_value_t **addr, jl_value_t *newval) JL_NOTSAFEPOINT { + PTRHASH_PIN((void*)addr) + PTRHASH_PIN((void*)newval) ptrhash_put(&field_replace, (void*)addr, newval); } @@ -2137,6 +2139,8 @@ static jl_svec_t *jl_prune_type_cache_hash(jl_svec_t *cache) JL_GC_DISABLED assert(serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] == cache); cache = cache_rehash_set(cache, l); // redirect all references to the old cache to relocate to the new cache object + PTRHASH_PIN((void*)cache) + PTRHASH_PIN((void*)idx) ptrhash_put(&serialization_order, cache, idx); serialization_queue.items[(char*)idx - 1 - (char*)HT_NOTFOUND] = cache; return cache; @@ -2387,6 +2391,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, htable_new(&fptr_to_id, sizeof(id_to_fptrs) / sizeof(*id_to_fptrs)); uintptr_t i; for (i = 0; id_to_fptrs[i] != NULL; i++) { + PTRHASH_PIN((void*)(uintptr_t)id_to_fptrs[i]) ptrhash_put(&fptr_to_id, (void*)(uintptr_t)id_to_fptrs[i], (void*)(i + 2)); } htable_new(&serialization_order, 25000); @@ -2473,6 +2478,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, htable_new(&external_objects, NUM_TAGS); for (size_t i = 0; tags[i] != NULL; i++) { jl_value_t *tag = *tags[i]; + PTRHASH_PIN(tag) ptrhash_put(&external_objects, tag, tag); } // Queue the worklist itself as the first item we serialize @@ -3044,6 +3050,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl assert(tag == 0); arraylist_push(&delay_list, pfld); arraylist_push(&delay_list, obj); + PTRHASH_PIN(obj) ptrhash_put(&new_dt_objs, (void*)obj, obj); // mark obj as invalid *pfld = (uintptr_t)NULL; continue; @@ -3077,6 +3084,8 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl } static_assert(offsetof(jl_datatype_t, name) == 0, ""); newdt->name = dt->name; + PTRHASH_PIN(newdt) + PTRHASH_PIN(dt) ptrhash_put(&new_dt_objs, (void*)newdt, dt); } else { diff --git a/src/staticdata_utils.c b/src/staticdata_utils.c index bf1a830b608de..bc6502c134672 100644 --- a/src/staticdata_utils.c +++ b/src/staticdata_utils.c @@ -272,6 +272,7 @@ static void jl_collect_new_roots(jl_array_t *roots, jl_array_t *new_specializati assert(jl_is_code_instance(ci)); jl_method_t *m = ci->def->def.method; assert(jl_is_method(m)); + PTRHASH_PIN(m) ptrhash_put(&mset, (void*)m, (void*)m); } int nwithkey; @@ -434,6 +435,7 @@ static void jl_collect_edges(jl_array_t *edges, jl_array_t *ext_targets, jl_arra for (size_t i = 0; i < jl_array_len(external_cis); i++) { jl_code_instance_t *ci = (jl_code_instance_t*)jl_array_ptr_ref(external_cis, i); jl_method_instance_t *mi = ci->def; + PTRHASH_PIN(mi) ptrhash_put(&external_mis, (void*)mi, (void*)mi); } } @@ -469,6 +471,8 @@ static void jl_collect_edges(jl_array_t *edges, jl_array_t *ext_targets, jl_arra for (size_t i = 0; i < l / 2; i++) { jl_method_instance_t *caller = (jl_method_instance_t*)jl_array_ptr_ref(edges, i * 2); void *target = (void*)((char*)HT_NOTFOUND + i + 1); + PTRHASH_PIN(caller) + PTRHASH_PIN(target) ptrhash_put(&edges_ids, (void*)caller, target); } // process target list to turn it into a memoized validity table @@ -545,6 +549,8 @@ static void jl_collect_edges(jl_array_t *edges, jl_array_t 
*ext_targets, jl_arra jl_array_ptr_1d_push(ext_targets, callee); jl_array_ptr_1d_push(ext_targets, matches); target = (void*)((char*)HT_NOTFOUND + jl_array_len(ext_targets) / 3); + PTRHASH_PIN(callee) + PTRHASH_PIN(target) ptrhash_put(&edges_map2, (void*)callee, target); } idxs[++nt] = (char*)target - (char*)HT_NOTFOUND - 1; @@ -1090,6 +1096,8 @@ static void jl_insert_backedges(jl_array_t *edges, jl_array_t *ext_targets, jl_a jl_code_instance_t *ci = (jl_code_instance_t*)jl_array_ptr_ref(ci_list, i); assert(ci->min_world == minworld); if (ci->max_world == 1) { // sentinel value: has edges to external callables + PTRHASH_PIN((void*)ci->def) + PTRHASH_PIN((void*)ci) ptrhash_put(&visited, (void*)ci->def, (void*)ci); } else { @@ -1155,6 +1163,7 @@ static void classify_callers(htable_t *callers_with_edges, jl_array_t *edges) size_t l = edges ? jl_array_len(edges) / 2 : 0; for (size_t i = 0; i < l; i++) { jl_method_instance_t *caller = (jl_method_instance_t*)jl_array_ptr_ref(edges, 2 * i); + PTRHASH_PIN((void*)caller) ptrhash_put(callers_with_edges, (void*)caller, (void*)caller); } } diff --git a/src/toplevel.c b/src/toplevel.c index 200d0ad220231..8a72ce8e6c2e6 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -140,6 +140,8 @@ static jl_value_t *jl_eval_module_expr(jl_module_t *parent_module, jl_expr_t *ex jl_value_t *form = (jl_value_t*)newm; JL_GC_PUSH1(&form); JL_LOCK(&jl_modules_mutex); + PTRHASH_PIN(newm) + PTRHASH_PIN((void*)((uintptr_t)HT_NOTFOUND + 1)) ptrhash_put(&jl_current_modules, (void*)newm, (void*)((uintptr_t)HT_NOTFOUND + 1)); JL_UNLOCK(&jl_modules_mutex); From 2f21eecf4c9a9156022fb4ebd18e774a3a293c57 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 7 Aug 2023 01:06:17 +0000 Subject: [PATCH 027/116] Pushing current task into the stack to set as red root; pin string when realloc-ing if pinned; removing static from functions needed to sweep live_tasks array --- src/gc-stacks.c | 4 ++-- src/mmtk-gc.c | 3 +++ src/task.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/gc-stacks.c b/src/gc-stacks.c index b35c1722c82ff..9e8b71ac442fc 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -66,7 +66,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) +void free_stack(void *stkbuf, size_t bufsz) { munmap(stkbuf, bufsz); jl_atomic_fetch_add(&num_stack_mappings, -1); @@ -104,7 +104,7 @@ static unsigned select_pool(size_t nb) JL_NOTSAFEPOINT } -static void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) +void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) { #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index baa5d6642721d..d8bae88fc56a1 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -428,6 +428,9 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) size_t len = jl_string_len(s); jl_value_t *snew = jl_alloc_string(sz); memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? 
sz : len); + if(mmtk_is_pinned(s)) { + mmtk_pin_object(snew); + } return snew; } diff --git a/src/task.c b/src/task.c index 477ae481071a0..267f7448fa52a 100644 --- a/src/task.c +++ b/src/task.c @@ -1207,6 +1207,7 @@ CFI_NORETURN jl_task_t *ct = jl_get_current_task(); #else jl_task_t *ct = jl_current_task; + JL_GC_PUSH1(&ct); #endif jl_ptls_t ptls = ct->ptls; jl_value_t *res; @@ -1247,6 +1248,7 @@ skip_pop_exception:; ct->result = res; jl_gc_wb(ct, ct->result); jl_finish_task(ct); + JL_GC_POP(); jl_gc_debug_critical_error(); abort(); } From 39530f9e4c9ca2d66bf152d4d125e44a43c9caef Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 9 Aug 2023 01:15:56 +0000 Subject: [PATCH 028/116] Fixing issue that prevented building Julia or running tests with Julia's debug build --- Makefile | 2 +- src/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3e4dbef73bb52..cfa5af6052db9 100644 --- a/Makefile +++ b/Makefile @@ -119,7 +119,7 @@ check-whitespace: ifneq ($(NO_GIT), 1) @# Append the directory containing the julia we just built to the end of `PATH`, @# to give us the best chance of being able to run this check. - @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" julia $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) + @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" $(JULIA_EXECUTABLE) $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) else $(warn "Skipping whitespace check because git is unavailable") endif diff --git a/src/Makefile b/src/Makefile index ff5f4ce8b99d6..63654f35026e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -176,7 +176,7 @@ DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) ifeq ($(WITH_MMTK), 1) MMTK_SRCS := mmtk_julia MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) -MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) +MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) $(MMTK_LIB_DST) else MMTK_OBJS := MMTK_DOBJS := From 5fe96d76427b48cc07df3165b339bc0db9522dd7 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 9 Aug 2023 01:15:56 +0000 Subject: [PATCH 029/116] Fixing issue that prevented building Julia or running tests with Julia's debug build --- Makefile | 2 +- src/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3e4dbef73bb52..cfa5af6052db9 100644 --- a/Makefile +++ b/Makefile @@ -119,7 +119,7 @@ check-whitespace: ifneq ($(NO_GIT), 1) @# Append the directory containing the julia we just built to the end of `PATH`, @# to give us the best chance of being able to run this check. 
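Stepping back to the task.c hunk of patch 027 above ("Pushing current task into the stack to set as red root"): under a moving plan, the C-level `ct` pointer must stay valid across the allocating calls on the task-finish path, so it is pushed into a GC frame, which the binding then reports as a pinning ("red") root. A minimal sketch of the rooting discipline, with `do_work` as an invented stand-in for any allocating call:

    #include "julia.h"

    extern void do_work(jl_value_t *v); // hypothetical; anything that may allocate

    void example(void)
    {
        jl_value_t *v = jl_box_int64(42); // freshly allocated, not yet rooted
        JL_GC_PUSH1(&v);                  // link &v into the task's GC frame list;
                                          // per the commit message, such slots are
                                          // treated as non-moving ("red") roots
        do_work(v);                       // may allocate and hit a safepoint
        JL_GC_POP();                      // must pair with the PUSH on every path
    }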
- @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" julia $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) + @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" $(JULIA_EXECUTABLE) $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) else $(warn "Skipping whitespace check because git is unavailable") endif diff --git a/src/Makefile b/src/Makefile index ff5f4ce8b99d6..63654f35026e9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -176,7 +176,7 @@ DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) ifeq ($(WITH_MMTK), 1) MMTK_SRCS := mmtk_julia MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) -MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) +MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) $(MMTK_LIB_DST) else MMTK_OBJS := MMTK_DOBJS := From fdada6c65a0c53de447abb2f331dae081dcb77cf Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Mon, 21 Aug 2023 01:49:01 +0000 Subject: [PATCH 030/116] Fix some build issues --- src/mmtk-gc.c | 7 +++---- src/partr.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index e87e7f0e1449f..a390de3ddffd9 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -350,8 +350,6 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } - // If the two values are the same, we can use either. Otherwise, we need to be careful. - assert(jl_n_gcthreads == jl_options.ngcthreads); // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined int copy_stacks; @@ -366,11 +364,12 @@ void jl_gc_init(void) // if only max size is specified initialize MMTk with a fixed size heap // TODO: We just assume mark threads means GC threads, and ignore the number of concurrent sweep threads. + // If the two values are the same, we can use either. Otherwise, we need to be careful. uintptr_t gcthreads = jl_options.nmarkthreads; if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { - mmtk_gc_init(0, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + mmtk_gc_init(0, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); } else { - mmtk_gc_init(min_heap_size, max_heap_size, jl_options.ngcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); } } diff --git a/src/partr.c b/src/partr.c index bfdc4ed727973..37cf9ca310d24 100644 --- a/src/partr.c +++ b/src/partr.c @@ -168,6 +168,20 @@ void jl_gc_sweep_threadfun(void *arg) } } +#else + +// gc thread mark function +void jl_gc_mark_threadfun(void *arg) +{ + mmtk_unreachable(); +} + +// gc thread sweep function +void jl_gc_sweep_threadfun(void *arg) +{ + mmtk_unreachable(); +} + #endif // thread function: used by all mutator threads except the main thread From 7985bb2f37817a5ec29a34da7f2a140a5c9192d9 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 22 Aug 2023 01:40:26 +0000 Subject: [PATCH 031/116] Fixing issue when setting up the number of stock GC threads - it should be 0 when using MMTk --- src/mmtk-gc.c | 5 +++-- src/threading.c | 11 ++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index d8bae88fc56a1..db2ce338529b5 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -350,8 +350,9 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } - // If the two values are the same, we can use either. 
Otherwise, we need to be careful. - assert(jl_n_gcthreads == jl_options.ngcthreads); + // when using mmtk, we don't spawn any stock GC thread + // and mmtk should use jl_options.ngcthreads to set the number of workers + assert(jl_n_gcthreads == 0); // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined int copy_stacks; diff --git a/src/threading.c b/src/threading.c index d58528fa183be..ddb4850aa074c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -667,6 +667,12 @@ void jl_init_threading(void) } } +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. + // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. + ngcthreads = 0; +#endif + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; @@ -684,11 +690,6 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int ngcthreads = jl_n_gcthreads; -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. - // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; From 27fc1013a130f7da2ae7f47b69763c4455bb405c Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 22 Aug 2023 02:26:48 +0000 Subject: [PATCH 032/116] Apply lock before schedule finalizers --- src/gc-common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/gc-common.c b/src/gc-common.c index 38f737ada576f..80365ec5e4a97 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -339,12 +339,18 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) jl_ptls_t* gc_all_tls_states; gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + // this is called from `jl_atexit_hook`; threads could still be running + // so we have to guard the finalizers' lists + JL_LOCK_NOGC(&finalizers_lock); schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } + // this is called from `jl_atexit_hook`; threads could still be running + // so we have to guard the finalizers' lists + JL_LOCK_NOGC(&finalizers_lock); gc_n_threads = 0; gc_all_tls_states = NULL; run_finalizers(ct); From e591ad86d475323b2079fc71f99a74ba0750a0cc Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 22 Aug 2023 01:40:26 +0000 Subject: [PATCH 033/116] Fixing issue when setting up the number of stock GC threads - it should be 0 when using MMTk --- src/mmtk-gc.c | 3 +++ src/threading.c | 11 ++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index a390de3ddffd9..8e87860c7b6ab 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -350,6 +350,9 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } + // when using mmtk, we don't spawn any stock GC thread + // and mmtk should use jl_options.ngcthreads to set the number of workers + assert(jl_n_gcthreads == 0); // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined int copy_stacks; diff --git a/src/threading.c b/src/threading.c index 78ecdcc98ae21..d1157a02dada0 100644 --- a/src/threading.c +++ b/src/threading.c @@ -694,6 +694,12 @@ void 
jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. + // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. + ngcthreads = 0; +#endif + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; @@ -711,11 +717,6 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int ngcthreads = jl_n_gcthreads; -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. - // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; From 00bab46345dd2fc8dd73d9a94ff7aa57ddd90e62 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 9 Aug 2023 01:15:56 +0000 Subject: [PATCH 034/116] Fixing issue that prevented building Julia or running tests with Julia's debug build --- Makefile | 2 +- src/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 895dbe8100b82..d5cce165dc596 100644 --- a/Makefile +++ b/Makefile @@ -119,7 +119,7 @@ check-whitespace: ifneq ($(NO_GIT), 1) @# Append the directory containing the julia we just built to the end of `PATH`, @# to give us the best chance of being able to run this check. - @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" julia $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) + @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" $(JULIA_EXECUTABLE) $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) else $(warn "Skipping whitespace check because git is unavailable") endif diff --git a/src/Makefile b/src/Makefile index 41629e5a686bf..5ea0a3d5cf76b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -179,7 +179,7 @@ DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) ifeq ($(WITH_MMTK), 1) MMTK_SRCS := mmtk_julia MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) -MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) +MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) $(MMTK_LIB_DST) else MMTK_OBJS := MMTK_DOBJS := From 6d8df8f8c2fa721d4c1b3eef6b641b66969a6625 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 28 Aug 2023 00:12:58 +0000 Subject: [PATCH 035/116] Setting number of stock Julia mutators to 0 and fixing assertion --- src/mmtk-gc.c | 4 ++-- src/threading.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index baa5d6642721d..4e7a551dd8381 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -350,8 +350,8 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } - // If the two values are the same, we can use either. Otherwise, we need to be careful. - assert(jl_n_gcthreads == jl_options.ngcthreads); + // Assert that the number of stock GC threads is 0; MMTK uses the number of threads in jl_options.ngcthreads + assert(jl_n_gcthreads == 0); // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined int copy_stacks; diff --git a/src/threading.c b/src/threading.c index d58528fa183be..ddb4850aa074c 100644 --- a/src/threading.c +++ b/src/threading.c @@ -667,6 +667,12 @@ void jl_init_threading(void) } } +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. 
+ // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. + ngcthreads = 0; +#endif + jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; @@ -684,11 +690,6 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int ngcthreads = jl_n_gcthreads; -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. - // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; From 8eab37df935555155a919fcbf0a55b9b4a0fa9f0 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Mon, 28 Aug 2023 02:04:39 +0000 Subject: [PATCH 036/116] Add write barrier for excstack update --- src/julia_internal.h | 2 +- src/rtutils.c | 7 ++++--- src/task.c | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index ed8e40bca4b01..737553ec98845 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1264,7 +1264,7 @@ STATIC_INLINE size_t jl_excstack_next(jl_excstack_t *stack, size_t itr) JL_NOTSA return itr-2 - jl_excstack_bt_size(stack, itr); } // Exception stack manipulation -void jl_push_excstack(jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT JL_ROOTING_ARGUMENT, +void jl_push_excstack(jl_task_t* task, jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT JL_ROOTING_ARGUMENT, jl_value_t *exception JL_ROOTED_ARGUMENT, jl_bt_element_t *bt_data, size_t bt_size); diff --git a/src/rtutils.c b/src/rtutils.c index 01ea11014a6db..7a31d37e4175c 100644 --- a/src/rtutils.c +++ b/src/rtutils.c @@ -320,7 +320,7 @@ static void jl_copy_excstack(jl_excstack_t *dest, jl_excstack_t *src) JL_NOTSAFE dest->top = src->top; } -static void jl_reserve_excstack(jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT, +static void jl_reserve_excstack(jl_task_t* task, jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT, size_t reserved_size) { jl_excstack_t *s = *stack; @@ -334,13 +334,14 @@ static void jl_reserve_excstack(jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT, if (s) jl_copy_excstack(new_s, s); *stack = new_s; + jl_gc_wb(task, new_s); } -void jl_push_excstack(jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT JL_ROOTING_ARGUMENT, +void jl_push_excstack(jl_task_t* task, jl_excstack_t **stack JL_REQUIRE_ROOTED_SLOT JL_ROOTING_ARGUMENT, jl_value_t *exception JL_ROOTED_ARGUMENT, jl_bt_element_t *bt_data, size_t bt_size) { - jl_reserve_excstack(stack, (*stack ? (*stack)->top : 0) + bt_size + 2); + jl_reserve_excstack(task, stack, (*stack ? (*stack)->top : 0) + bt_size + 2); jl_excstack_t *s = *stack; jl_bt_element_t *rawstack = jl_excstack_raw(s); memcpy(rawstack + s->top, bt_data, sizeof(jl_bt_element_t)*bt_size); diff --git a/src/task.c b/src/task.c index 1dab8688cb079..73d9033f0cb50 100644 --- a/src/task.c +++ b/src/task.c @@ -721,7 +721,7 @@ JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e, jl_task_t *ct) /* The temporary ptls->bt_data is rooted by special purpose code in the\ GC. This exists only for the purpose of preserving bt_data until we \ set ptls->bt_size=0 below. 
*/ \ - jl_push_excstack(&ct->excstack, exception, \ + jl_push_excstack(ct, &ct->excstack, exception, \ ptls->bt_data, ptls->bt_size); \ ptls->bt_size = 0; \ } \ @@ -1224,7 +1224,7 @@ CFI_NORETURN jl_timing_block_task_enter(ct, ptls, NULL); if (jl_atomic_load_relaxed(&ct->_isexception)) { record_backtrace(ptls, 0); - jl_push_excstack(&ct->excstack, ct->result, + jl_push_excstack(ct, &ct->excstack, ct->result, ptls->bt_data, ptls->bt_size); res = ct->result; } From d0cbd133727fb0135826ba09128f259aaf34d403 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 29 Aug 2023 02:30:35 +0000 Subject: [PATCH 037/116] Revert "Apply lock before schedule finalizers" This reverts commit 27fc1013a130f7da2ae7f47b69763c4455bb405c. --- src/gc-common.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 80365ec5e4a97..38f737ada576f 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -339,18 +339,12 @@ void jl_gc_run_all_finalizers(jl_task_t *ct) jl_ptls_t* gc_all_tls_states; gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - // this is called from `jl_atexit_hook`; threads could still be running - // so we have to guard the finalizers' lists - JL_LOCK_NOGC(&finalizers_lock); schedule_all_finalizers(&finalizer_list_marked); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) schedule_all_finalizers(&ptls2->finalizers); } - // this is called from `jl_atexit_hook`; threads could still be running - // so we have to guard the finalizers' lists - JL_LOCK_NOGC(&finalizers_lock); gc_n_threads = 0; gc_all_tls_states = NULL; run_finalizers(ct); From 8d0d8b5db22469d2d3f4a0a65af635f67d7701e3 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Tue, 29 Aug 2023 05:51:04 +0000 Subject: [PATCH 038/116] Revert "Fixing issue when setting up the number of stock GC threads - it should be 0 when using MMTk" This reverts commit e591ad86d475323b2079fc71f99a74ba0750a0cc. --- src/mmtk-gc.c | 3 --- src/threading.c | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 8e87860c7b6ab..a390de3ddffd9 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -350,9 +350,6 @@ void jl_gc_init(void) max_heap_size = uv_get_free_memory() * 70 / 100; } - // when using mmtk, we don't spawn any stock GC thread - // and mmtk should use jl_options.ngcthreads to set the number of workers - assert(jl_n_gcthreads == 0); // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined int copy_stacks; diff --git a/src/threading.c b/src/threading.c index d1157a02dada0..78ecdcc98ae21 100644 --- a/src/threading.c +++ b/src/threading.c @@ -694,12 +694,6 @@ void jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. - // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif - jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads; jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int)); jl_n_threads_per_pool[0] = nthreadsi; @@ -717,6 +711,11 @@ void jl_start_threads(void) { int nthreads = jl_atomic_load_relaxed(&jl_n_threads); int ngcthreads = jl_n_gcthreads; +#ifdef MMTK_GC + // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. 
+ // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. + ngcthreads = 0; +#endif int cpumasksize = uv_cpumask_size(); char *cp; int i, exclusive; From 66a49ccf864bded60b232140f57c69059a503f07 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 7 Sep 2023 01:31:12 +0000 Subject: [PATCH 039/116] Refactoring the code to reuse most of jl_gc_collect in block_for_gc --- src/gc-common.c | 25 +++++++++++++++++++++++++ src/gc.c | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 38f737ada576f..0f6307c1db98f 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -601,6 +601,31 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } +void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) +{ + JL_TIMING(GC, GC_Stop); +#ifdef USE_TRACY + TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx; + TracyCZoneColor(ctx, 0x696969); +#endif + assert(gc_n_threads); + if (gc_n_threads > 1) + jl_wake_libuv(); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + if (ptls2 != NULL) { + // This acquire load pairs with the release stores + // in the signal handler of safepoint so we are sure that + // all the stores on those threads are visible. + // We're currently also using atomic store release in mutator threads + // (in jl_gc_state_set), but we may want to use signals to flush the + // memory operations on those threads lazily instead. + while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) + jl_cpu_pause(); // yield? + } + } +} + JL_DLLEXPORT int jl_gc_is_enabled(void) { jl_ptls_t ptls = jl_current_task->ptls; diff --git a/src/gc.c b/src/gc.c index 4846549af93e4..924cdce356a4b 100644 --- a/src/gc.c +++ b/src/gc.c @@ -344,31 +344,6 @@ NOINLINE uintptr_t gc_get_stack_ptr(void) #define should_timeout() 0 -void jl_gc_wait_for_the_world(jl_ptls_t* gc_all_tls_states, int gc_n_threads) -{ - JL_TIMING(GC, GC_Stop); -#ifdef USE_TRACY - TracyCZoneCtx ctx = JL_TIMING_DEFAULT_BLOCK->tracy_ctx; - TracyCZoneColor(ctx, 0x696969); -#endif - assert(gc_n_threads); - if (gc_n_threads > 1) - jl_wake_libuv(); - for (int i = 0; i < gc_n_threads; i++) { - jl_ptls_t ptls2 = gc_all_tls_states[i]; - if (ptls2 != NULL) { - // This acquire load pairs with the release stores - // in the signal handler of safepoint so we are sure that - // all the stores on those threads are visible. - // We're currently also using atomic store release in mutator threads - // (in jl_gc_state_set), but we may want to use signals to flush the - // memory operations on those threads lazily instead. - while (!jl_atomic_load_relaxed(&ptls2->gc_state) || !jl_atomic_load_acquire(&ptls2->gc_state)) - jl_cpu_pause(); // yield? 
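A note on the double load in the loop above, which this patch moves verbatim from gc.c into gc-common.c: the relaxed load keeps the hot spin cheap, and only an apparently-set gc_state is confirmed with an acquire load, which is what synchronizes with the release store in jl_gc_state_set. A self-contained C11 model of the idiom (illustrative only):

    #include <stdatomic.h>

    // Spin until *state becomes nonzero. The relaxed read is the cheap fast
    // check; the acquire read establishes ordering with the setter's release
    // store, so everything written before that store is visible on exit.
    static void wait_until_set(_Atomic(int) *state)
    {
        while (!atomic_load_explicit(state, memory_order_relaxed) ||
               !atomic_load_explicit(state, memory_order_acquire))
            ; // the runtime calls jl_cpu_pause() here
    }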
- } - } -} - // malloc wrappers, aligned allocation #if defined(_OS_WINDOWS_) From f9da153bddb2b494feb3cd1840021ea574aa7f36 Mon Sep 17 00:00:00 2001 From: Luis Eduardo de Souza Amorim Date: Fri, 15 Sep 2023 06:49:02 +0000 Subject: [PATCH 040/116] Checking if object is managed by mmtk before calling pin function; Pinning owner to avoid introspecting it during scanning --- src/array.c | 9 +++++++++ src/julia.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/array.c b/src/array.c index 8a064583bbc9e..7c49889ee4662 100644 --- a/src/array.c +++ b/src/array.c @@ -239,6 +239,9 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data, jl_array_t *owner = (jl_array_t*)jl_array_owner(data); jl_array_data_owner(a) = (jl_value_t*)owner; + if(mmtk_object_is_managed_by_mmtk(owner)) { + mmtk_pin_object(owner); + } a->flags.how = 3; a->data = data->data; a->flags.isshared = 1; @@ -287,6 +290,9 @@ JL_DLLEXPORT jl_array_t *jl_string_to_array(jl_value_t *str) a->flags.ptrarray = 0; a->flags.hasptr = 0; jl_array_data_owner(a) = str; + if(mmtk_object_is_managed_by_mmtk(str)) { + mmtk_pin_object(str); + } a->flags.how = 3; a->flags.isshared = 1; size_t l = jl_string_len(str); @@ -683,6 +689,9 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) else { s = jl_gc_realloc_string(jl_array_data_owner(a), nbytes - (elsz == 1)); } + if(mmtk_object_is_managed_by_mmtk(s)) { + mmtk_pin_object(s); + } jl_array_data_owner(a) = s; jl_gc_wb(a, s); a->data = jl_string_data(s); diff --git a/src/julia.h b/src/julia.h index 87e965bdae621..337e5131eeee7 100644 --- a/src/julia.h +++ b/src/julia.h @@ -10,7 +10,9 @@ extern "C" { extern int mmtk_object_is_managed_by_mmtk(void* addr); extern unsigned char mmtk_pin_object(void* obj); #define PTRHASH_PIN(key) \ + if (mmtk_object_is_managed_by_mmtk(key)) { \ mmtk_pin_object(key); \ + } \ #ifdef __cplusplus } From cfb6d90e1fcba32a088f073ebf71ac6a98f2cf1d Mon Sep 17 00:00:00 2001 From: Luis Eduardo de Souza Amorim Date: Fri, 15 Sep 2023 06:54:26 +0000 Subject: [PATCH 041/116] Fixing duplicate code from merging mistake --- src/threading.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/threading.c b/src/threading.c index 8d13788fe657c..d1157a02dada0 100644 --- a/src/threading.c +++ b/src/threading.c @@ -694,12 +694,6 @@ void jl_init_threading(void) } int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads; -#ifdef MMTK_GC - // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. - // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. - ngcthreads = 0; -#endif - #ifdef MMTK_GC // MMTk gets the number of GC threads from jl_options.ngcthreads, and spawn its GC threads. // So we just set ngcthreads to 0 here to avoid spawning any GC threads in Julia. 
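The guard that patch 040 above adds before each pin call is worth spelling out: pinning only makes sense for objects that actually live in MMTk-managed memory; other allocations (such as permanent objects) never move, so the check both skips useless work and keeps foreign pointers out of MMTk's pin API. A minimal sketch of the idiom, using the extern declarations introduced in that patch:

    extern int mmtk_object_is_managed_by_mmtk(void* addr);
    extern unsigned char mmtk_pin_object(void* obj);

    // Pin obj only when it lives in an MMTk space; anything else is
    // already non-moving.
    static void pin_if_mmtk_object(void *obj)
    {
        if (mmtk_object_is_managed_by_mmtk(obj))
            mmtk_pin_object(obj);
    }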
From 6921f6c8934bf99beace5047fe73b1ff9772e9bd Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 5 Oct 2023 03:08:43 +0000 Subject: [PATCH 042/116] Stop using Julia's size classes --- src/array.c | 4 ++-- src/gc-common.c | 10 ---------- src/gc.c | 10 ++++++++++ src/julia_internal.h | 2 +- src/mmtk-gc.c | 11 +++++++++++ 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/array.c b/src/array.c index 8a064583bbc9e..73b6b04669978 100644 --- a/src/array.c +++ b/src/array.c @@ -497,8 +497,8 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) s = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); #else int pool_id = jl_gc_szclass_align8(allocsz); - int osize = jl_gc_sizeclasses[pool_id]; - s = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, jl_string_type); + // int osize = jl_gc_sizeclasses[pool_id]; + s = jl_mmtk_gc_alloc_default(ptls, pool_id, allocsz, jl_string_type); #endif } else { diff --git a/src/gc-common.c b/src/gc-common.c index 0f6307c1db98f..6f6c4c5330d74 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -450,16 +450,6 @@ jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize return jl_gc_pool_alloc_inner(ptls, pool_offset, osize); } -int jl_gc_classify_pools(size_t sz, int *osize) -{ - if (sz > GC_MAX_SZCLASS) - return -1; - size_t allocsz = sz + sizeof(jl_taggedvalue_t); - int klass = jl_gc_szclass(allocsz); - *osize = jl_gc_sizeclasses[klass]; - return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); -} - // TODO: jl_gc_track_malloced_array needed? Eliminate heap.mallocarrays, // heap.mafreelist, mallocarray_t? void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT diff --git a/src/gc.c b/src/gc.c index 924cdce356a4b..5febd88d5f9b6 100644 --- a/src/gc.c +++ b/src/gc.c @@ -902,6 +902,16 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT return fl; } +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + int klass = jl_gc_szclass(allocsz); + *osize = jl_gc_sizeclasses[klass]; + return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); +} + // Size includes the tag and the tag is not cleared!! inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) diff --git a/src/julia_internal.h b/src/julia_internal.h index 737553ec98845..588ebf7504017 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -495,7 +495,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass(allocsz); int osize = jl_gc_sizeclasses[pool_id]; - v = jl_mmtk_gc_alloc_default(ptls, pool_id, osize, ty); + v = jl_mmtk_gc_alloc_default(ptls, pool_id, allocsz, ty); } else { if (allocsz < sz) // overflow in adding offs, size was "negative" diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 1175c6a161750..b40e734d0a63f 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -53,6 +53,17 @@ static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) } } +// allocation +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + int klass = jl_gc_szclass(allocsz); + *osize = LLT_ALIGN(allocsz, 16); + return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); +} + // malloc wrappers, aligned allocation // We currently just duplicate what Julia GC does. 
We will in the future replace the malloc calls with MMTK's malloc. From 29f59932c45cdd379189c85545fac6980126f60f Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 5 Oct 2023 03:46:32 +0000 Subject: [PATCH 043/116] Removing code related to size classes --- src/array.c | 3 +-- src/julia_internal.h | 6 ++---- src/mmtk-gc.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/array.c b/src/array.c index 73b6b04669978..a0166a6479798 100644 --- a/src/array.c +++ b/src/array.c @@ -497,8 +497,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) s = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); #else int pool_id = jl_gc_szclass_align8(allocsz); - // int osize = jl_gc_sizeclasses[pool_id]; - s = jl_mmtk_gc_alloc_default(ptls, pool_id, allocsz, jl_string_type); + s = jl_mmtk_gc_alloc_default(ptls, allocsz, jl_string_type); #endif } else { diff --git a/src/julia_internal.h b/src/julia_internal.h index 588ebf7504017..ee32dbe922caa 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -339,7 +339,7 @@ jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTK_GC -JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int pool_offset, int osize, void* ty); +JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, void* ty); JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); JL_DLLIMPORT extern void mmtk_post_alloc(void* mutator, void* obj, size_t bytes, int allocator); JL_DLLIMPORT extern void mmtk_initialize_collection(void* tls); @@ -493,9 +493,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) jl_value_t *v; const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - int osize = jl_gc_sizeclasses[pool_id]; - v = jl_mmtk_gc_alloc_default(ptls, pool_id, allocsz, ty); + v = jl_mmtk_gc_alloc_default(ptls, allocsz, ty); } else { if (allocsz < sz) // overflow in adding offs, size was "negative" diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index b40e734d0a63f..9c532379c599f 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -168,7 +168,7 @@ inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int o // TODO: drop this okay? // maybe_collect(ptls); - jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, pool_offset, osize, NULL); + jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, osize, NULL); // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_default; enable // here when that's edited? 
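To make the new sizing rule of patches 042/043 concrete: instead of mapping a request onto one of Julia's size classes, jl_gc_classify_pools now just rounds the tagged allocation size up to a 16-byte cell. A worked example, assuming an 8-byte jl_taggedvalue_t header (the usual 64-bit layout) and the standard power-of-two rounding behind LLT_ALIGN:

    #include <stdio.h>
    #include <stddef.h>

    // Round x up to a multiple of align (align must be a power of two);
    // reconstructed here to keep the example standalone.
    #define LLT_ALIGN(x, align) (((x) + (align) - 1) & ~((align) - 1))

    int main(void)
    {
        size_t sz = 19;          // caller-requested payload bytes
        size_t allocsz = sz + 8; // + sizeof(jl_taggedvalue_t), assumed 8
        printf("%zu\n", (size_t)LLT_ALIGN(allocsz, 16)); // 27 rounds up to 32
        return 0;
    }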
/* From a0e35b52a2492fc4dc0d262ea2af4120e76d1398 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 6 Oct 2023 01:44:50 +0000 Subject: [PATCH 044/116] Minor refactoring of jl_gc_classify_pools --- src/mmtk-gc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 9c532379c599f..d00d763238051 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -56,12 +56,11 @@ static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) // allocation int jl_gc_classify_pools(size_t sz, int *osize) { - if (sz > GC_MAX_SZCLASS) - return -1; + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function size_t allocsz = sz + sizeof(jl_taggedvalue_t); - int klass = jl_gc_szclass(allocsz); *osize = LLT_ALIGN(allocsz, 16); - return (int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass]); + return 0; // use MMTk's fastpath logic } // malloc wrappers, aligned allocation From 387814b244daa9cd1f7917e9355972bd91ba53a7 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 6 Oct 2023 03:23:24 +0000 Subject: [PATCH 045/116] Aligning strings to 8 bytes --- src/array.c | 3 +-- src/julia_internal.h | 4 ++-- src/mmtk-gc.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/array.c b/src/array.c index a0166a6479798..e17346947e73d 100644 --- a/src/array.c +++ b/src/array.c @@ -496,8 +496,7 @@ JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) s = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); #else - int pool_id = jl_gc_szclass_align8(allocsz); - s = jl_mmtk_gc_alloc_default(ptls, allocsz, jl_string_type); + s = jl_mmtk_gc_alloc_default(ptls, allocsz, 8, jl_string_type); #endif } else { diff --git a/src/julia_internal.h b/src/julia_internal.h index ee32dbe922caa..575e84e9d41a7 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -339,7 +339,7 @@ jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); #ifdef MMTK_GC -JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, void* ty); +JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void* ty); JL_DLLIMPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t allocsz); JL_DLLIMPORT extern void mmtk_post_alloc(void* mutator, void* obj, size_t bytes, int allocator); JL_DLLIMPORT extern void mmtk_initialize_collection(void* tls); @@ -493,7 +493,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) jl_value_t *v; const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { - v = jl_mmtk_gc_alloc_default(ptls, allocsz, ty); + v = jl_mmtk_gc_alloc_default(ptls, allocsz, 16, ty); } else { if (allocsz < sz) // overflow in adding offs, size was "negative" diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index d00d763238051..060b9f22a0e33 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -167,7 +167,7 @@ inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int o // TODO: drop this okay? // maybe_collect(ptls); - jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, osize, NULL); + jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_default; enable // here when that's edited? 
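One detail of the refactor in patch 044 above: the value the stock implementation returned, `(int)(intptr_t)(&((jl_ptls_t)0)->heap.norm_pools[klass])`, is the classic null-base offset idiom; taking a member's address relative to a null struct pointer yields that member's byte offset, which the GC lowering pass can add to the thread-local ptls pointer to find the pool. MMTk has no per-thread pool array to point into, hence the new `return 0; // use MMTk's fastpath logic`. A standalone model of the idiom (`tls_model_t` is invented for the example; the construct is formally undefined behavior in ISO C, but it is the same one the runtime relies on):

    #include <stdio.h>
    #include <stdint.h>

    typedef struct {
        char other_state[64]; // stand-in for the fields before the pools
        int norm_pools[49];
    } tls_model_t;

    int main(void)
    {
        int klass = 3;
        // Address of a member off a null base == the member's byte offset,
        // i.e. what offsetof(tls_model_t, norm_pools[3]) would compute.
        int off = (int)(intptr_t)(&((tls_model_t *)0)->norm_pools[klass]);
        printf("%d\n", off); // 64 + 3 * sizeof(int) = 76 on typical ABIs
        return 0;
    }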
/* From 1d97f5dfea84ed463630ce57965c7716cf7403e8 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 6 Oct 2023 03:55:55 +0000 Subject: [PATCH 046/116] Fixing whitespace --- src/mmtk-gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 060b9f22a0e33..efeadc903f71a 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -56,7 +56,7 @@ static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) // allocation int jl_gc_classify_pools(size_t sz, int *osize) { - if (sz > GC_MAX_SZCLASS) + if (sz > GC_MAX_SZCLASS) return -1; // call big alloc function size_t allocsz = sz + sizeof(jl_taggedvalue_t); *osize = LLT_ALIGN(allocsz, 16); From 4792b73132850aa153fe22023b8a45af29a3554f Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 1 Feb 2024 01:44:31 +0000 Subject: [PATCH 047/116] Removing functions to call mmtk's enable and disable collection --- src/mmtk-gc.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 1175c6a161750..347ea4634bc9e 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -503,14 +503,6 @@ void jl_gc_threadfun(void *arg) } // added for MMTk integration -void enable_collection(void) -{ - mmtk_enable_collection(); -} -void disable_collection(void) -{ - mmtk_disable_collection(); -} JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT { From 4e057c98921ee71c4b86553df039923857a72c8f Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 1 Feb 2024 01:51:34 +0000 Subject: [PATCH 048/116] Forgot a few other places --- src/gc-common.c | 2 -- src/gc.c | 6 ------ src/julia_internal.h | 2 -- 3 files changed, 10 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 0f6307c1db98f..aae56366d0c2d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -588,11 +588,9 @@ JL_DLLEXPORT int jl_gc_enable(int on) if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { gc_num.allocd += gc_num.deferred_alloc; gc_num.deferred_alloc = 0; - enable_collection(); } } else if (prev && !on) { - disable_collection(); // enable -> disable jl_atomic_fetch_add(&jl_gc_disable_counter, 1); // check if the GC is running and wait for it to finish diff --git a/src/gc.c b/src/gc.c index 924cdce356a4b..c4023e922b676 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3438,12 +3438,6 @@ void jl_gc_threadfun(void *arg) } // added for MMTk integration -void enable_collection(void) -{ -} -void disable_collection(void) -{ -} JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT { diff --git a/src/julia_internal.h b/src/julia_internal.h index 737553ec98845..ec60dc76f5f16 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -333,8 +333,6 @@ extern jl_array_t *jl_all_methods JL_GLOBALLY_ROOTED; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; -extern void enable_collection(void); -extern void disable_collection(void); jl_value_t *jl_gc_pool_alloc_noinline(jl_ptls_t ptls, int pool_offset, int osize); jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t allocsz); From 1348e5009adf133f3631c82875c4096667ca760c Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 1 Feb 2024 01:57:46 +0000 Subject: [PATCH 049/116] Missing reference in thread initialization --- src/threading.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/threading.c b/src/threading.c index d1157a02dada0..b050b3eccd87a 100644 --- a/src/threading.c +++ b/src/threading.c @@ -351,9 +351,6 @@ jl_ptls_t 
jl_init_threadtls(int16_t tid) ptls->rngseed = jl_rand(); if (tid == 0) { ptls->disable_gc = 1; -#ifdef MMTK_GC - disable_collection(); -#endif } #ifdef _OS_WINDOWS_ if (tid == 0) { From 125d05e1f11dd9e5add538d6c17fcd39474af711 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 1 Feb 2024 02:07:25 +0000 Subject: [PATCH 050/116] Exporting jl_gc_disable_counter --- src/gc-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-common.c b/src/gc-common.c index aae56366d0c2d..5ff30bbba5ac5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -576,7 +576,7 @@ void gc_premark(jl_ptls_t ptls2) // GC control // --- -_Atomic(uint32_t) jl_gc_disable_counter = 1; +JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter = 1; JL_DLLEXPORT int jl_gc_enable(int on) { From 99aa5dd6289ac5445f9f019c4d8c4284fe5da0ba Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 1 Feb 2024 02:52:20 +0000 Subject: [PATCH 051/116] Exporting jl_gc_disable_counter --- src/gc.h | 2 +- src/julia_internal.h | 2 +- src/mmtk-gc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gc.h b/src/gc.h index 0630a039f2b94..03b52fb24acbb 100644 --- a/src/gc.h +++ b/src/gc.h @@ -83,7 +83,7 @@ extern const size_t max_collect_interval; extern size_t last_long_collect_interval; extern size_t total_mem; extern memsize_t max_total_memory; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; extern jl_mutex_t heapsnapshot_lock; extern uint64_t finalizer_rngState[]; extern int gc_n_threads; diff --git a/src/julia_internal.h b/src/julia_internal.h index ec60dc76f5f16..f5c9e07ce7329 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -936,7 +936,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 3; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
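The mmtk-gc.c hunk that follows upgrades the read of jl_gc_disable_counter in jl_gc_collect from a relaxed to an acquire load, pairing the collector's entry point with updates made through jl_gc_enable. A minimal self-contained model of the counter protocol (illustrative only; the real updates go through jl_gc_enable and also reconcile deferred allocation counts):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    static _Atomic(uint32_t) disable_counter; // models jl_gc_disable_counter

    static void gc_disable(void) { atomic_fetch_add(&disable_counter, 1); }
    static void gc_enable(void)  { atomic_fetch_sub(&disable_counter, 1); }

    // Collection request: while any thread holds the counter above zero,
    // record the allocation debt and return instead of collecting.
    static bool gc_may_collect(void)
    {
        return atomic_load_explicit(&disable_counter, memory_order_acquire) == 0;
    }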
diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 347ea4634bc9e..db985b5149f8a 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -219,7 +219,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - if (jl_atomic_load_relaxed(&jl_gc_disable_counter)) { + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval; jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); From a0c895ee0ff25b751475c3015ed82cd84c058b0e Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 9 Feb 2024 01:41:50 +0000 Subject: [PATCH 052/116] Increasing the timeout just in case it's caused by the github runner specs --- test/threads.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/threads.jl b/test/threads.jl index 8189311739e31..376c77347e15f 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -288,7 +288,7 @@ close(proc.in) proc = run(cmd; wait = false) done = Threads.Atomic{Bool}(false) timeout = false - timer = Timer(100) do _ + timer = Timer(150) do _ timeout = true for sig in [Base.SIGTERM, Base.SIGHUP, Base.SIGKILL] for _ in 1:1000 From 3b4ae537216b59ccaded4f7b946dc7faf2a27bb5 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 12 Feb 2024 02:52:56 +0000 Subject: [PATCH 053/116] Refactoring pinning functions and adding comments --- src/array.c | 24 +++++++++++++++--------- src/builtins.c | 19 ++++++++++--------- src/datatype.c | 4 ++++ src/julia.h | 4 ++-- src/julia_internal.h | 4 ++++ src/mmtk-gc.c | 1 + 6 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/array.c b/src/array.c index 7c49889ee4662..73ba4cc0e8214 100644 --- a/src/array.c +++ b/src/array.c @@ -239,9 +239,11 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data, jl_array_t *owner = (jl_array_t*)jl_array_owner(data); jl_array_data_owner(a) = (jl_value_t*)owner; - if(mmtk_object_is_managed_by_mmtk(owner)) { - mmtk_pin_object(owner); - } + // For array objects with an owner point (a->flags.how == 3), we would need to + // introspect the object to update the a->data field. To avoid doing that and + // making scan_object much more complex we simply enforce that both owner and + // buffers are always pinned + mmtk_pin_object(owner); a->flags.how = 3; a->data = data->data; a->flags.isshared = 1; @@ -290,9 +292,11 @@ JL_DLLEXPORT jl_array_t *jl_string_to_array(jl_value_t *str) a->flags.ptrarray = 0; a->flags.hasptr = 0; jl_array_data_owner(a) = str; - if(mmtk_object_is_managed_by_mmtk(str)) { - mmtk_pin_object(str); - } + // For array objects with an owner point (a->flags.how == 3), we would need to + // introspect the object to update the a->data field. To avoid doing that and + // making scan_object much more complex we simply enforce that both owner and + // buffers are always pinned + mmtk_pin_object(str); a->flags.how = 3; a->flags.isshared = 1; size_t l = jl_string_len(str); @@ -689,9 +693,11 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) else { s = jl_gc_realloc_string(jl_array_data_owner(a), nbytes - (elsz == 1)); } - if(mmtk_object_is_managed_by_mmtk(s)) { - mmtk_pin_object(s); - } + // For array objects with an owner point (a->flags.how == 3), we would need to + // introspect the object to update the a->data field. 
To avoid doing that and + // making scan_object much more complex we simply enforce that both owner and + // buffers are always pinned + mmtk_pin_object(s); jl_array_data_owner(a) = s; jl_gc_wb(a, s); a->data = jl_string_data(s); diff --git a/src/builtins.c b/src/builtins.c index 0094f4e5a2141..0a2cc9cd42729 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -344,9 +344,9 @@ static uintptr_t type_object_id_(jl_value_t *v, jl_varidx_t *env) JL_NOTSAFEPOIN i++; pe = pe->prev; } - if(mmtk_object_is_managed_by_mmtk(v)) { - mmtk_pin_object(v); - } + // FIXME: Pinning objects that get hashed + // until we implement address space hashing. + mmtk_pin_object(v); return inthash((uintptr_t)v); } if (tv == jl_uniontype_type) { @@ -395,9 +395,10 @@ static uintptr_t immut_id_(jl_datatype_t *dt, jl_value_t *v, uintptr_t h) JL_NOT return ~h; size_t f, nf = jl_datatype_nfields(dt); if (nf == 0 || (!dt->layout->haspadding && dt->layout->npointers == 0)) { - if(mmtk_object_is_managed_by_mmtk(v)) { - mmtk_pin_object(v); - } + + // FIXME: Pinning objects that get hashed + // until we implement address space hashing. + mmtk_pin_object(v); // operate element-wise if there are unused bits inside, // otherwise just take the whole data block at once // a few select pointers (notably symbol) also have special hash values @@ -459,9 +460,9 @@ static uintptr_t NOINLINE jl_object_id__cold(jl_datatype_t *dt, jl_value_t *v) J return m->hash; } if (dt->name->mutabl) { - if(mmtk_object_is_managed_by_mmtk(v)) { - mmtk_pin_object(v); - } + // FIXME: Pinning objects that get hashed + // until we implement address space hashing. + mmtk_pin_object(v); return inthash((uintptr_t)v); } return immut_id_(dt, v, dt->hash); diff --git a/src/datatype.c b/src/datatype.c index 20c3af1555675..9e6d480985c69 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -65,6 +65,8 @@ JL_DLLEXPORT jl_typename_t *jl_new_typename_in(jl_sym_t *name, jl_module_t *modu jl_typename_t *tn = (jl_typename_t*)jl_gc_alloc(ct->ptls, sizeof(jl_typename_t), jl_typename_type); + // Typenames should be pinned since they are used as metadata, and are + // read during scan_object mmtk_pin_object(tn); tn->name = name; tn->module = module; @@ -97,6 +99,8 @@ jl_datatype_t *jl_new_uninitialized_datatype(void) { jl_task_t *ct = jl_current_task; jl_datatype_t *t = (jl_datatype_t*)jl_gc_alloc(ct->ptls, sizeof(jl_datatype_t), jl_datatype_type); + // Types should be pinned since they are used as metadata, and are + // read during scan_object mmtk_pin_object(t); jl_set_typetagof(t, jl_datatype_tag, 0); t->hash = 0; diff --git a/src/julia.h b/src/julia.h index 337e5131eeee7..3a33e59e3835a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -9,10 +9,10 @@ extern "C" { extern int mmtk_object_is_managed_by_mmtk(void* addr); extern unsigned char mmtk_pin_object(void* obj); +// FIXME: Pinning objects that get hashed in the ptrhash table +// until we implement address space hashing. 
#define PTRHASH_PIN(key) \ - if (mmtk_object_is_managed_by_mmtk(key)) { \ mmtk_pin_object(key); \ - } \ #ifdef __cplusplus } diff --git a/src/julia_internal.h b/src/julia_internal.h index 4f90b44c80887..90e6dd6ce1ec1 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -533,6 +533,10 @@ typedef void jl_gc_tracked_buffer_t; // For the benefit of the static analyzer STATIC_INLINE jl_gc_tracked_buffer_t *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) { jl_gc_tracked_buffer_t *buf = jl_gc_alloc(ptls, sz, (void*)jl_buff_tag); + // For array objects with an owner point (a->flags.how == 3), we would need to + // introspect the object to update the a->data field. To avoid doing that and + // making scan_object much more complex we simply enforce that both owner and + // buffers are always pinned mmtk_pin_object(buf); return buf; } diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 9d8b0f049e11d..05989a6ac335d 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -432,6 +432,7 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) jl_value_t *snew = jl_alloc_string(sz); memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? sz : len); if(mmtk_is_pinned(s)) { + // if the source string was pinned, we also pin the new one mmtk_pin_object(snew); } return snew; From 6f5f68500c77d4f8daecead6caacf447600dc57a Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Wed, 28 Feb 2024 13:45:44 +1300 Subject: [PATCH 054/116] Call initialize_collection after _finish_julia_init (#40) --- src/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/init.c b/src/init.c index 52f4740ccc306..8a379a5922f5a 100644 --- a/src/init.c +++ b/src/init.c @@ -824,9 +824,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) jl_ptls_t ptls = jl_init_threadtls(0); -#ifdef MMTK_GC - mmtk_initialize_collection((void *)ptls); -#endif #pragma GCC diagnostic push #if defined(_COMPILER_GCC_) && __GNUC__ >= 12 #pragma GCC diagnostic ignored "-Wdangling-pointer" @@ -836,6 +833,9 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) #pragma GCC diagnostic pop JL_GC_PROMISE_ROOTED(ct); _finish_julia_init(rel, ptls, ct); +#ifdef MMTK_GC + mmtk_initialize_collection((void *)ptls); +#endif } static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct) From d907a0630b4551cfbb321366da6e96242ada25d1 Mon Sep 17 00:00:00 2001 From: Yi Lin Date: Fri, 10 May 2024 15:51:14 +1200 Subject: [PATCH 055/116] Port PR #42 and #44 to master (#48) * Call initialize_collection before enabling GC (#44) * Fix build with stock GC: mmtk_pin_object is conditionaly compiled (#42) --- src/array.c | 6 +++--- src/builtins.c | 6 +++--- src/datatype.c | 4 ++-- src/init.c | 6 +++--- src/julia.h | 7 +++++-- src/julia_internal.h | 18 +++++++++++++++++- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/array.c b/src/array.c index bc73f582d63fa..2877604f7a900 100644 --- a/src/array.c +++ b/src/array.c @@ -243,7 +243,7 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data, // introspect the object to update the a->data field. To avoid doing that and // making scan_object much more complex we simply enforce that both owner and // buffers are always pinned - mmtk_pin_object(owner); + PTR_PIN(owner); a->flags.how = 3; a->data = data->data; a->flags.isshared = 1; @@ -296,7 +296,7 @@ JL_DLLEXPORT jl_array_t *jl_string_to_array(jl_value_t *str) // introspect the object to update the a->data field. 
To avoid doing that and // making scan_object much more complex we simply enforce that both owner and // buffers are always pinned - mmtk_pin_object(str); + PTR_PIN(str); a->flags.how = 3; a->flags.isshared = 1; size_t l = jl_string_len(str); @@ -695,7 +695,7 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) // introspect the object to update the a->data field. To avoid doing that and // making scan_object much more complex we simply enforce that both owner and // buffers are always pinned - mmtk_pin_object(s); + PTR_PIN(s); jl_array_data_owner(a) = s; jl_gc_wb(a, s); a->data = jl_string_data(s); diff --git a/src/builtins.c b/src/builtins.c index 0a2cc9cd42729..d961f36cbc707 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -346,7 +346,7 @@ static uintptr_t type_object_id_(jl_value_t *v, jl_varidx_t *env) JL_NOTSAFEPOIN } // FIXME: Pinning objects that get hashed // until we implement address space hashing. - mmtk_pin_object(v); + PTR_PIN(v); return inthash((uintptr_t)v); } if (tv == jl_uniontype_type) { @@ -398,7 +398,7 @@ static uintptr_t immut_id_(jl_datatype_t *dt, jl_value_t *v, uintptr_t h) JL_NOT // FIXME: Pinning objects that get hashed // until we implement address space hashing. - mmtk_pin_object(v); + PTR_PIN(v); // operate element-wise if there are unused bits inside, // otherwise just take the whole data block at once // a few select pointers (notably symbol) also have special hash values @@ -462,7 +462,7 @@ static uintptr_t NOINLINE jl_object_id__cold(jl_datatype_t *dt, jl_value_t *v) J if (dt->name->mutabl) { // FIXME: Pinning objects that get hashed // until we implement address space hashing. - mmtk_pin_object(v); + PTR_PIN(v); return inthash((uintptr_t)v); } return immut_id_(dt, v, dt->hash); diff --git a/src/datatype.c b/src/datatype.c index 9e6d480985c69..ae8853f37c688 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -67,7 +67,7 @@ JL_DLLEXPORT jl_typename_t *jl_new_typename_in(jl_sym_t *name, jl_module_t *modu jl_typename_type); // Typenames should be pinned since they are used as metadata, and are // read during scan_object - mmtk_pin_object(tn); + PTR_PIN(tn); tn->name = name; tn->module = module; tn->wrapper = NULL; @@ -101,7 +101,7 @@ jl_datatype_t *jl_new_uninitialized_datatype(void) jl_datatype_t *t = (jl_datatype_t*)jl_gc_alloc(ct->ptls, sizeof(jl_datatype_t), jl_datatype_type); // Types should be pinned since they are used as metadata, and are // read during scan_object - mmtk_pin_object(t); + PTR_PIN(t); jl_set_typetagof(t, jl_datatype_tag, 0); t->hash = 0; t->hasfreetypevars = 0; diff --git a/src/init.c b/src/init.c index 8a379a5922f5a..faa446a34cf22 100644 --- a/src/init.c +++ b/src/init.c @@ -833,9 +833,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) #pragma GCC diagnostic pop JL_GC_PROMISE_ROOTED(ct); _finish_julia_init(rel, ptls, ct); -#ifdef MMTK_GC - mmtk_initialize_collection((void *)ptls); -#endif } static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_task_t *ct) @@ -883,6 +880,9 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ } jl_start_threads(); +#ifdef MMTK_GC + mmtk_initialize_collection((void *)ptls); +#endif jl_gc_enable(1); if (jl_options.image_file && (!jl_generating_output() || jl_options.incremental) && jl_module_init_order) { diff --git a/src/julia.h b/src/julia.h index 3a33e59e3835a..ed6305715a87c 100644 --- a/src/julia.h +++ b/src/julia.h @@ -11,8 +11,11 @@ extern int mmtk_object_is_managed_by_mmtk(void* addr); extern unsigned char 
mmtk_pin_object(void* obj); // FIXME: Pinning objects that get hashed in the ptrhash table // until we implement address space hashing. -#define PTRHASH_PIN(key) \ - mmtk_pin_object(key); \ +#ifdef MMTK_GC +#define PTRHASH_PIN(key) mmtk_pin_object(key); +#else +#define PTRHASH_PIN(key) +#endif #ifdef __cplusplus } diff --git a/src/julia_internal.h b/src/julia_internal.h index a8593a23a4e40..25983ea6c0d27 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -3,6 +3,22 @@ #ifndef JL_INTERNAL_H #define JL_INTERNAL_H +#ifdef __cplusplus +extern "C" { +#endif + +extern int mmtk_object_is_managed_by_mmtk(void* addr); +extern unsigned char mmtk_pin_object(void* obj); +#ifdef MMTK_GC +#define PTR_PIN(key) mmtk_pin_object(key); +#else +#define PTR_PIN(key) +#endif + +#ifdef __cplusplus +} +#endif + #include "options.h" #include "julia_assert.h" #include "julia_locks.h" @@ -535,7 +551,7 @@ STATIC_INLINE jl_gc_tracked_buffer_t *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) // introspect the object to update the a->data field. To avoid doing that and // making scan_object much more complex we simply enforce that both owner and // buffers are always pinned - mmtk_pin_object(buf); + PTR_PIN(buf); return buf; } From f9f38df99af1bfa8e37a0d4e224ed17bf037c7b8 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 18 Jul 2024 04:42:09 +0000 Subject: [PATCH 056/116] Changes to support mmtk --- src/gc-common.c | 37 +++++++++++++++++++++++++++++ src/gc-page-profiler.c | 4 ++++ src/gc-stacks.c | 2 ++ src/gc.c | 43 +++------------------------------- src/gc.h | 30 ++++++++++++------------ src/julia.h | 5 ++-- src/llvm-final-gc-lowering.cpp | 25 ++++++++------------ src/llvm-late-gc-lowering.cpp | 8 +++---- src/llvm-pass-helpers.cpp | 32 +++++++++++++++++++++++++ src/mmtk-gc.c | 42 +++++++++++---------------------- src/scheduler.c | 14 +++++++++++ 11 files changed, 137 insertions(+), 105 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 7462cb06c3cf3..d1e87b5741384 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -3,6 +3,7 @@ #include "gc.h" jl_gc_num_t gc_num = {0}; +gc_heapstatus_t gc_heap_stats = {0}; size_t last_long_collect_interval; int gc_n_threads; jl_ptls_t* gc_all_tls_states; @@ -484,7 +485,43 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_batch_accum_heap_size(ptls, sz); } +void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT +{ + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); +} + +void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT +{ + jl_batch_accum_free_size(jl_current_task->ptls, sz); +} +void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT +{ + assert(jl_is_genericmemory(v)); + jl_genericmemory_t *m = (jl_genericmemory_t*)v; + assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); + char *d = (char*)m->ptr; + if (isaligned) + jl_free_aligned(d); + else + free(d); + jl_atomic_store_relaxed(&gc_heap_stats.heap_size, + jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m)); + gc_num.freed += jl_genericmemory_nbytes(m); + gc_num.freecall++; +} + +void jl_free_thread_gc_state(jl_ptls_t ptls) +{ + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + free_ws_array(jl_atomic_load_relaxed(&cq->array)); + jl_atomic_store_relaxed(&cq->array, NULL); + ws_queue_t *q = &mq->ptr_queue; + free_ws_array(jl_atomic_load_relaxed(&q->array)); + jl_atomic_store_relaxed(&q->array, NULL); + 
arraylist_free(&mq->reclaim_set); +} // GCNum, statistics manipulation // --- diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index 2e876e4b7b4d6..fe7a52b4d1f8a 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + #include "gc-page-profiler.h" #ifdef __cplusplus @@ -177,3 +179,5 @@ JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) #ifdef __cplusplus } #endif + +#endif // !MMTK_GC \ No newline at end of file diff --git a/src/gc-stacks.c b/src/gc-stacks.c index e00e954c105e0..465dce7fda26b 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -325,6 +325,8 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } +extern int gc_first_tid; + JL_DLLEXPORT jl_array_t *jl_live_tasks(void) { size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); diff --git a/src/gc.c b/src/gc.c index edf90e8741498..520a8b8cb608c 100644 --- a/src/gc.c +++ b/src/gc.c @@ -193,8 +193,6 @@ static _Atomic(int) support_conservative_marking = 0; * have proper support of GC transition in codegen, we should execute the * finalizers in unmanaged (GC safe) mode. */ - -gc_heapstatus_t gc_heap_stats = {0}; int next_sweep_full = 0; // List of marked big objects. Not per-thread. Accessed only by master thread. @@ -600,10 +598,7 @@ STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTS } } -STATIC_INLINE void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT -{ - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); -} + // big value list @@ -697,28 +692,7 @@ static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT } // tracking Memorys with malloc'd storage -void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT -{ - jl_batch_accum_free_size(jl_current_task->ptls, sz); -} - - -static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT -{ - assert(jl_is_genericmemory(v)); - jl_genericmemory_t *m = (jl_genericmemory_t*)v; - assert(jl_genericmemory_how(m) == 1 || jl_genericmemory_how(m) == 2); - char *d = (char*)m->ptr; - if (isaligned) - jl_free_aligned(d); - else - free(d); - jl_atomic_store_relaxed(&gc_heap_stats.heap_size, - jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_genericmemory_nbytes(m)); - gc_num.freed += jl_genericmemory_nbytes(m); - gc_num.freecall++; -} - +extern void jl_gc_free_memory(jl_value_t *v, int isaligned); static void sweep_malloced_memory(void) JL_NOTSAFEPOINT { gc_time_mallocd_memory_start(); @@ -3380,18 +3354,6 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); } -void jl_free_thread_gc_state(jl_ptls_t ptls) -{ - jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; - ws_queue_t *cq = &mq->chunk_queue; - free_ws_array(jl_atomic_load_relaxed(&cq->array)); - jl_atomic_store_relaxed(&cq->array, NULL); - ws_queue_t *q = &mq->ptr_queue; - free_ws_array(jl_atomic_load_relaxed(&q->array)); - jl_atomic_store_relaxed(&q->array, NULL); - arraylist_free(&mq->reclaim_set); -} - void jl_deinit_thread_heap(jl_ptls_t ptls) { // Do nothing @@ -3478,6 +3440,7 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) return data; } +extern void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) { jl_gcframe_t **pgcstack = jl_get_pgcstack(); diff --git a/src/gc.h b/src/gc.h index 
eb724985b599d..ea2766a646127 100644 --- a/src/gc.h +++ b/src/gc.h @@ -71,6 +71,14 @@ extern uint64_t finalizer_rngState[]; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +#ifdef GC_SMALL_PAGE +#define GC_PAGE_LG2 12 // log2(size of a page) +#else +#define GC_PAGE_LG2 14 // log2(size of a page) +#endif +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) +#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) + // This struct must be kept in sync with the Julia type of the same name in base/timing.jl typedef struct { int64_t allocd; @@ -99,6 +107,13 @@ typedef struct { uint64_t last_incremental_sweep; } jl_gc_num_t; +typedef struct { + _Atomic(size_t) bytes_mapped; + _Atomic(size_t) bytes_resident; + _Atomic(size_t) heap_size; + _Atomic(size_t) heap_target; +} gc_heapstatus_t; + extern jl_gc_num_t gc_num; // data structure for tracking malloc'd arrays. @@ -192,14 +207,6 @@ typedef struct { extern "C" { #endif -#ifdef GC_SMALL_PAGE -#define GC_PAGE_LG2 12 // log2(size of a page) -#else -#define GC_PAGE_LG2 14 // log2(size of a page) -#endif -#define GC_PAGE_SZ (1 << GC_PAGE_LG2) -#define GC_PAGE_OFFSET (JL_HEAP_ALIGNMENT - (sizeof(jl_taggedvalue_t) % JL_HEAP_ALIGNMENT)) - #define jl_malloc_tag ((void*)0xdeadaa01) #define jl_singleton_tag ((void*)0xdeadaa02) @@ -428,13 +435,6 @@ typedef struct { pagetable1_t *meta1[REGION2_PG_COUNT]; } pagetable_t; -typedef struct { - _Atomic(size_t) bytes_mapped; - _Atomic(size_t) bytes_resident; - _Atomic(size_t) heap_size; - _Atomic(size_t) heap_target; -} gc_heapstatus_t; - #define GC_PAGE_UNMAPPED 0 #define GC_PAGE_ALLOCATED 1 #define GC_PAGE_LAZILY_FREED 2 diff --git a/src/julia.h b/src/julia.h index aab7512b6cc03..a16785ee0e9d3 100644 --- a/src/julia.h +++ b/src/julia.h @@ -646,12 +646,11 @@ typedef struct _jl_binding_t { _Atomic(struct _jl_binding_t*) owner; // for individual imported bindings (NULL until 'resolved') _Atomic(jl_value_t*) ty; // binding type uint8_t constp:1; - uint8_t exportp:1; // `public foo` sets `publicp`, `export foo` sets both `publicp` and `exportp` - uint8_t publicp:1; // exportp without publicp is not allowed. 
+ uint8_t exportp:1; uint8_t imported:1; uint8_t usingfailed:1; uint8_t deprecated:2; // 0=not deprecated, 1=renamed, 2=moved to another package - uint8_t padding:1; + uint8_t padding:2; } jl_binding_t; typedef struct { diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 83944f63bacee..cbc26da892403 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -307,7 +307,7 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) builder.CreateStore(new_cursor, cursor_ptr); // ptls->gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num)); + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); @@ -322,11 +322,13 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) phiNode->addIncoming(v_as_ptr, fastpath); phiNode->takeName(target); - return phiNode; + target->replaceAllUsesWith(phiNode); + target->eraseFromParent(); + return; } else { auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); - newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32 }); - derefAttr = Attribute::getWithDereferenceableBytes(F.getContext(), osize); + newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); + derefBytes = sizeof(void*); } #endif // MMTK_GC } @@ -368,13 +370,6 @@ bool FinalLowerGC::runOnFunction(Function &F) allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped); T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); -#ifdef MMTK_GC - auto writeBarrier1Func = getOrNull(jl_intrinsics::writeBarrier1); - auto writeBarrier2Func = getOrNull(jl_intrinsics::writeBarrier2); - auto writeBarrier1SlowFunc = getOrNull(jl_intrinsics::writeBarrier1Slow); - auto writeBarrier2SlowFunc = getOrNull(jl_intrinsics::writeBarrier2Slow); -#endif - // Lower all calls to supported intrinsics. 
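The derefBytes value threaded through lowerGCAllocBytes above ends up as LLVM's dereferenceable return attribute on the lowered allocation call, which later passes may use to justify speculative loads of the object header. Roughly (a sketch of the pattern, not the pass verbatim):

    CallInst *newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
    if (derefBytes > 0)
        newI->addDereferenceableRetAttr(derefBytes);  // result is known dereferenceable(derefBytes)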
for (auto &BB : F) { for (auto &I : make_early_inc_range(BB)) { @@ -403,10 +398,10 @@ bool FinalLowerGC::runOnFunction(Function &F) #ifdef MMTK_GC - LOWER_INTRINSIC(writeBarrier1Func, lowerWriteBarrier1); - LOWER_INTRINSIC(writeBarrier2Func, lowerWriteBarrier2); - LOWER_INTRINSIC(writeBarrier1SlowFunc, lowerWriteBarrier1Slow); - LOWER_INTRINSIC(writeBarrier2SlowFunc, lowerWriteBarrier2Slow); + LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1); + LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2); + LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow); + LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow); #endif diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 1e2e0d3933783..f257afd2c6211 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2575,15 +2575,15 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { if (CFGModified) { *CFGModified = true; } + + IRBuilder<> builder(CI); + builder.SetCurrentDebugLocation(CI->getDebugLoc()); +#ifndef MMTK_GC auto DebugInfoMeta = F.getParent()->getModuleFlag("julia.debug_level"); int debug_info = 1; if (DebugInfoMeta != nullptr) { debug_info = cast(cast(DebugInfoMeta)->getValue())->getZExtValue(); } - - IRBuilder<> builder(CI); - builder.SetCurrentDebugLocation(CI->getDebugLoc()); -#ifndef MMTK_GC auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED); setName(parBits, "parent_bits", debug_info); auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED)); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 7f701cb2db639..6d6c3898e875c 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -275,7 +275,11 @@ namespace jl_intrinsics { false), Function::ExternalLinkage, WRITE_BARRIER_1_NAME); +#if JL_LLVM_VERSION >= 160000 + intrinsic->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return intrinsic; }); const IntrinsicDescription writeBarrier2( @@ -290,7 +294,11 @@ namespace jl_intrinsics { false), Function::ExternalLinkage, WRITE_BARRIER_2_NAME); +#if JL_LLVM_VERSION >= 160000 + intrinsic->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return intrinsic; }); const IntrinsicDescription writeBarrier1Slow( @@ -305,7 +313,11 @@ namespace jl_intrinsics { false), Function::ExternalLinkage, WRITE_BARRIER_1_SLOW_NAME); +#if JL_LLVM_VERSION >= 160000 + intrinsic->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return intrinsic; }); const IntrinsicDescription writeBarrier2Slow( @@ -320,7 +332,11 @@ namespace jl_intrinsics { false), Function::ExternalLinkage, WRITE_BARRIER_2_SLOW_NAME); +#if JL_LLVM_VERSION >= 160000 + intrinsic->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else intrinsic->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return intrinsic; }); #endif @@ -423,7 +439,11 @@ namespace jl_well_known { false), Function::ExternalLinkage, GC_WB_1_NAME); +#if JL_LLVM_VERSION >= 160000 + func->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return func; }); @@ -439,7 +459,11 @@ namespace jl_well_known { false), Function::ExternalLinkage, GC_WB_2_NAME); +#if JL_LLVM_VERSION >= 
160000 + func->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return func; }); @@ -455,7 +479,11 @@ namespace jl_well_known { false), Function::ExternalLinkage, GC_WB_1_SLOW_NAME); +#if JL_LLVM_VERSION >= 160000 + func->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return func; }); @@ -471,7 +499,11 @@ namespace jl_well_known { false), Function::ExternalLinkage, GC_WB_2_SLOW_NAME); +#if JL_LLVM_VERSION >= 160000 + func->setMemoryEffects(MemoryEffects::inaccessibleOrArgMemOnly()); +#else func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); +#endif return func; }); #endif diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 0691a1db776e8..8a7d95871d7c6 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -182,19 +182,6 @@ inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int o return v; } -void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT -{ - if (a->flags.how == 2) { - char *d = (char*)a->data - a->offset*a->elsize; - if (a->flags.isaligned) - jl_free_aligned(d); - else - free(d); - gc_num.freed += jl_array_nbytes(a); - gc_num.freecall++; - } -} - // roots // --- @@ -204,7 +191,7 @@ JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) } // TODO: exported, but not MMTk-specific? -JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *root, const void *stored, jl_datatype_t *dt) JL_NOTSAFEPOINT { mmtk_unreachable(); } @@ -233,10 +220,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); - jl_atomic_fetch_add((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; } mmtk_handle_user_collection_request(ptls, collection); @@ -247,32 +234,31 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // TODO: remove `gc_cache`? 
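When the disable counter is set, jl_gc_collect above still keeps the allocation statistics coherent: the bytes this thread has allocated since its last counter reset are folded into the global deferred total, so the next enabled collection sees the accumulated pressure. Ignoring the atomics, the bookkeeping amounts to:

    int64_t localbytes = ptls->gc_tls.gc_num.allocd + gc_num.interval; // bytes allocated since the last reset
    ptls->gc_tls.gc_num.allocd = -(int64_t)gc_num.interval;            // restart the per-thread countdown
    gc_num.deferred_alloc += localbytes;                               // credited to the next real collection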
void jl_init_thread_heap(jl_ptls_t ptls) { - jl_thread_heap_t *heap = &ptls->heap; + jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { p[i].osize = jl_gc_sizeclasses[i]; p[i].freelist = NULL; p[i].newpages = NULL; } - arraylist_new(&heap->weak_refs, 0); - arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); heap->mallocarrays = NULL; heap->mafreelist = NULL; heap->big_objects = NULL; - heap->remset = &heap->_remset[0]; - heap->last_remset = &heap->_remset[1]; - arraylist_new(heap->remset, 0); - arraylist_new(heap->last_remset, 0); + arraylist_new(&heap->remset, 0); arraylist_new(&ptls->finalizers, 0); - arraylist_new(&ptls->sweep_objs, 0); + arraylist_new(&ptls->gc_tls.sweep_objs, 0); - jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache; + jl_gc_mark_cache_t *gc_cache = &ptls->gc_tls.gc_cache; gc_cache->perm_scanned_bytes = 0; gc_cache->scanned_bytes = 0; gc_cache->nbig_obj = 0; - memset(&ptls->gc_num, 0, sizeof(ptls->gc_num)); - jl_atomic_store_relaxed(&ptls->gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); // Clear the malloc sz count jl_atomic_store_relaxed(&ptls->malloc_sz_since_last_poll, 0); diff --git a/src/scheduler.c b/src/scheduler.c index 2c7dbd63ef4a4..5c885dd2f3b76 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -112,6 +112,8 @@ void jl_init_threadinginfra(void) void JL_NORETURN jl_finish_task(jl_task_t *ct); +#ifndef MMTK_GC + static inline int may_mark(void) JL_NOTSAFEPOINT { return (jl_atomic_load(&gc_n_threads_marking) > 0); @@ -185,6 +187,18 @@ void jl_concurrent_gc_threadfun(void *arg) } } +#else +void jl_parallel_gc_threadfun(void *arg) +{ + mmtk_unreachable(); +} + +void jl_concurrent_gc_threadfun(void *arg) +{ + mmtk_unreachable(); +} +#endif + // thread function: used by all mutator threads except the main thread void jl_threadfun(void *arg) { From b64c1e4721036606bfa8555d77f91d6e4a0a1a88 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 22 Jul 2024 21:53:50 +0000 Subject: [PATCH 057/116] Final changes to support the binding --- src/gc-common.c | 14 ++++++++++ src/gc.c | 14 ---------- src/gc.h | 1 - src/genericmemory.c | 1 + src/julia.h | 7 ++--- src/llvm-final-gc-lowering.cpp | 49 +++++++++++++++------------------- 6 files changed, 41 insertions(+), 45 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index d1e87b5741384..640f2ec1de29e 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -477,6 +477,20 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) +{ + int n_threads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + int64_t pool_live_bytes = 0; + for (int i = 0; i < n_threads; i++) { + jl_ptls_t ptls2 = all_tls_states[i]; + if (ptls2 != NULL) { + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); + } + } + return pool_live_bytes; +} + void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; diff --git a/src/gc.c b/src/gc.c index 520a8b8cb608c..a189ac24b9f95 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2789,20 +2789,6 @@ 
static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) -{ - int n_threads = jl_atomic_load_acquire(&jl_n_threads); - jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); - int64_t pool_live_bytes = 0; - for (int i = 0; i < n_threads; i++) { - jl_ptls_t ptls2 = all_tls_states[i]; - if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); - } - } - return pool_live_bytes; -} - uint64_t jl_gc_smooth(uint64_t old_val, uint64_t new_val, double factor) { double est = factor * old_val + (1 - factor) * new_val; diff --git a/src/gc.h b/src/gc.h index ea2766a646127..c9320a6dbd837 100644 --- a/src/gc.h +++ b/src/gc.h @@ -47,7 +47,6 @@ extern jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int o extern void jl_rng_split(uint64_t to[JL_RNG_SIZE], uint64_t from[JL_RNG_SIZE]); extern void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz, int isaligned, jl_value_t *owner, int8_t can_collect); -extern size_t jl_array_nbytes(jl_array_t *a); extern void run_finalizers(jl_task_t *ct, int finalizers_thread); #define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT) diff --git a/src/genericmemory.c b/src/genericmemory.c index ea52fca66ba48..24db8f29f1a12 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -54,6 +54,7 @@ jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t is tot = sizeof(jl_genericmemory_t) + sizeof(void*); } m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tot, mtype); + if (pooled) { data = (char*)m + JL_SMALL_BYTE_ALIGNMENT; } diff --git a/src/julia.h b/src/julia.h index a16785ee0e9d3..b4ff97daae150 100644 --- a/src/julia.h +++ b/src/julia.h @@ -646,11 +646,12 @@ typedef struct _jl_binding_t { _Atomic(struct _jl_binding_t*) owner; // for individual imported bindings (NULL until 'resolved') _Atomic(jl_value_t*) ty; // binding type uint8_t constp:1; - uint8_t exportp:1; + uint8_t exportp:1; // `public foo` sets `publicp`, `export foo` sets both `publicp` and `exportp` + uint8_t publicp:1; // exportp without publicp is not allowed. 
uint8_t imported:1; uint8_t usingfailed:1; uint8_t deprecated:2; // 0=not deprecated, 1=renamed, 2=moved to another package - uint8_t padding:2; + uint8_t padding:1; } jl_binding_t; typedef struct { @@ -809,7 +810,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index cbc26da892403..9339cbff1ec61 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -103,7 +103,6 @@ void FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) builder.CreateMemSet(gcframe, Constant::getNullValue(Type::getInt8Ty(F.getContext())), ptrsize * (nRoots + 2), Align(16), tbaa_gcframe); target->replaceAllUsesWith(gcframe); - target->eraseFromParent(); } void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F) @@ -131,7 +130,6 @@ void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F) gcframe, pgcstack, Align(sizeof(void*))); - target->eraseFromParent(); } void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F) @@ -150,7 +148,6 @@ void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F) pgcstack, Align(sizeof(void*))); inst->setMetadata(LLVMContext::MD_tbaa, tbaa_gcframe); - target->eraseFromParent(); } void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F) @@ -170,7 +167,6 @@ void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F) auto gep = builder.CreateInBoundsGEP(T_prjlvalue, gcframe, index); gep->takeName(target); target->replaceAllUsesWith(gep); - target->eraseFromParent(); } void FinalLowerGC::lowerQueueGCRoot(CallInst *target, Function &F) @@ -187,7 +183,6 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) IRBuilder<> builder(target); Value* signal_page = target->getOperand(0); builder.CreateLoad(T_size, signal_page, true); - target->eraseFromParent(); } #ifdef MMTK_GC @@ -252,7 +247,7 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. - const bool INLINE_FASTPATH_ALLOCATION = true; + const bool INLINE_FASTPATH_ALLOCATION = false; if (INLINE_FASTPATH_ALLOCATION) { // Assuming we use the first immix allocator. 
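In C-like pseudocode, the fastpath this block emits is a standard bump-pointer allocation out of the thread-local immix buffer. A sketch, where `tls_load`/`tls_store` and `slow_alloc` are stand-ins for the GEP/load/store sequences and the pool-alloc call, and the 8/15 constants mirror the header-size and alignment arithmetic in the IR:

    uintptr_t cursor = tls_load(ptls, cursor_offset);
    uintptr_t delta = ((uintptr_t)-8 - cursor) & 15;          // pad so the 8-byte header ends 16-aligned
    uintptr_t result = cursor + delta;
    uintptr_t new_cursor = result + osize;
    if ((intptr_t)new_cursor > (intptr_t)tls_load(ptls, limit_offset))
        return slow_alloc(ptls, osize, type);                 // buffer exhausted: let MMTk refill it
    tls_store(ptls, cursor_offset, new_cursor);
    return (jl_value_t*)(result + sizeof(jl_taggedvalue_t));  // object pointer starts after the type tag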
@@ -307,12 +302,12 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) builder.CreateStore(new_cursor, cursor_ptr); // ptls->gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); - auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - builder.CreateStore(pool_allocd_total, pool_alloc_tls); + // auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + // auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + // auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + // auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + // auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + // builder.CreateStore(pool_allocd_total, pool_alloc_tls); auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); @@ -321,14 +316,14 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) phiNode->addIncoming(new_call, slowpath); phiNode->addIncoming(v_as_ptr, fastpath); phiNode->takeName(target); - + target->replaceAllUsesWith(phiNode); - target->eraseFromParent(); return; } else { auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); - derefBytes = sizeof(void*); + if (sz > 0) + derefBytes = sz; } #endif // MMTK_GC } @@ -346,7 +341,6 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) newI->addDereferenceableRetAttr(derefBytes); newI->takeName(target); target->replaceAllUsesWith(newI); - target->eraseFromParent(); } bool FinalLowerGC::runOnFunction(Function &F) @@ -372,21 +366,23 @@ bool FinalLowerGC::runOnFunction(Function &F) // Lower all calls to supported intrinsics. 
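The loop rewrite below drops make_early_inc_range in favor of a manual iterator because Instruction::eraseFromParent() returns an iterator to the following instruction; that lets the LOWER_INTRINSIC macro erase the matched call and resume scanning without skipping or revisiting anything. The underlying idiom, in isolation (matchIntrinsicCall and lower are hypothetical names):

    for (auto it = BB.begin(); it != BB.end();) {
        if (auto *CI = matchIntrinsicCall(&*it)) {  // call to a supported GC intrinsic?
            lower(CI);                              // rewrite all uses of CI in place
            it = CI->eraseFromParent();             // iterator of the next instruction
            continue;
        }
        ++it;
    }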
for (auto &BB : F) { - for (auto &I : make_early_inc_range(BB)) { - auto *CI = dyn_cast(&I); - if (!CI) + for (auto it = BB.begin(); it != BB.end();) { + auto *CI = dyn_cast(&*it); + if (!CI) { + ++it; continue; + } Value *callee = CI->getCalledOperand(); assert(callee); #define LOWER_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \ - do { \ - auto intrinsic = getOrNull(jl_intrinsics::INTRINSIC); \ - if (intrinsic == callee) { \ - LOWER_INTRINSIC_FUNC(CI, F); \ - } \ - } while (0) + auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \ + if (INTRINSIC == callee) { \ + LOWER_INTRINSIC_FUNC(CI, F); \ + it = CI->eraseFromParent(); \ + continue; \ + } \ LOWER_INTRINSIC(newGCFrame, lowerNewGCFrame); LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame); @@ -396,14 +392,13 @@ bool FinalLowerGC::runOnFunction(Function &F) LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot); LOWER_INTRINSIC(safepoint, lowerSafepoint); - #ifdef MMTK_GC LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1); LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2); LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow); LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow); #endif - + ++it; #undef LOWER_INTRINSIC } From 8bb0895bb38a4371387169941595c84d896a70fa Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 25 Jul 2024 00:52:28 +0000 Subject: [PATCH 058/116] Immix working; inlining fastpath allocation is not --- src/datatype.c | 8 ++++++++ src/gc-common.c | 22 ++++++++++------------ src/gc-debug.c | 10 ---------- src/gc-page-profiler.c | 2 +- src/gc.c | 12 ++++++++++++ src/jitlayers.h | 7 ++++++- src/llvm-final-gc-lowering.cpp | 21 ++++++++++++++------- src/llvm-late-gc-lowering.cpp | 2 +- src/mmtk-gc.c | 32 +++++++++++++++++++++++++++----- 9 files changed, 79 insertions(+), 37 deletions(-) diff --git a/src/datatype.c b/src/datatype.c index 422e9a4e897ed..cb10ef7719dd5 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -291,6 +291,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz, if ((void*)ret == HT_NOTFOUND) { if (!should_malloc) { char *perm_mem = (char *)jl_gc_perm_alloc(flddesc_sz, 0, 4, 0); +#ifdef MMTK_GC + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(perm_mem), flddesc_sz); +#endif assert(perm_mem); ret = (jl_datatype_layout_t *)perm_mem; memcpy(perm_mem, flddesc, flddesc_sz); @@ -968,6 +972,10 @@ JL_DLLEXPORT jl_datatype_t * jl_new_foreign_type(jl_sym_t *name, jl_datatype_layout_t *layout = (jl_datatype_layout_t *) jl_gc_perm_alloc(sizeof(jl_datatype_layout_t) + sizeof(jl_fielddescdyn_t), 0, 4, 0); +#ifdef MMTK_GC + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(layout), sizeof(jl_datatype_layout_t) + sizeof(jl_fielddescdyn_t)); +#endif layout->size = large ? 
GC_MAX_SZCLASS+1 : 0; layout->nfields = 0; layout->alignment = sizeof(void *); diff --git a/src/gc-common.c b/src/gc-common.c index 640f2ec1de29e..98ef3f62125f9 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -525,18 +525,6 @@ void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT gc_num.freecall++; } -void jl_free_thread_gc_state(jl_ptls_t ptls) -{ - jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; - ws_queue_t *cq = &mq->chunk_queue; - free_ws_array(jl_atomic_load_relaxed(&cq->array)); - jl_atomic_store_relaxed(&cq->array, NULL); - ws_queue_t *q = &mq->ptr_queue; - free_ws_array(jl_atomic_load_relaxed(&q->array)); - jl_atomic_store_relaxed(&q->array, NULL); - arraylist_free(&mq->reclaim_set); -} - // GCNum, statistics manipulation // --- // Only safe to update the heap inside the GC @@ -642,6 +630,16 @@ JL_DLLEXPORT int jl_gc_is_enabled(void) return !ptls->disable_gc; } +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; diff --git a/src/gc-debug.c b/src/gc-debug.c index 41607638fa5df..19348b380e145 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1142,16 +1142,6 @@ NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int off // return (slot - start) / elsize; // } -static int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index fe7a52b4d1f8a..05666c7a86af2 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -180,4 +180,4 @@ JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) } #endif -#endif // !MMTK_GC \ No newline at end of file +#endif // !MMTK_GC diff --git a/src/gc.c b/src/gc.c index a189ac24b9f95..ed7188a1b449a 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3345,6 +3345,18 @@ void jl_deinit_thread_heap(jl_ptls_t ptls) // Do nothing } +void jl_free_thread_gc_state(jl_ptls_t ptls) +{ + jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue; + ws_queue_t *cq = &mq->chunk_queue; + free_ws_array(jl_atomic_load_relaxed(&cq->array)); + jl_atomic_store_relaxed(&cq->array, NULL); + ws_queue_t *q = &mq->ptr_queue; + free_ws_array(jl_atomic_load_relaxed(&q->array)); + jl_atomic_store_relaxed(&q->array, NULL); + arraylist_free(&mq->reclaim_set); +} + // System-wide initializations void jl_gc_init(void) { diff --git a/src/jitlayers.h b/src/jitlayers.h index 393e6d81e418d..aed88f05a1cfb 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -320,7 +320,12 @@ class MaxAlignedAllocImpl LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, Align Alignment) { Align MaxAlign = alignment(Size); assert(Alignment < MaxAlign); (void)Alignment; - return jl_gc_perm_alloc(Size, 0, MaxAlign.value(), offset); + void* result = jl_gc_perm_alloc(Size, 0, MaxAlign.value(), offset); +#ifdef MMTK_GC + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(result), Size); +#endif + return result; } inline LLVM_ATTRIBUTE_RETURNS_NONNULL diff --git a/src/llvm-final-gc-lowering.cpp 
b/src/llvm-final-gc-lowering.cpp index 9339cbff1ec61..ac2e6c385d0a5 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -302,12 +302,12 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) builder.CreateStore(new_cursor, cursor_ptr); // ptls->gc_num.allocd += osize; - // auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); - // auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); - // auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); - // auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); - // auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); - // builder.CreateStore(pool_allocd_total, pool_alloc_tls); + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType()); @@ -316,7 +316,7 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) phiNode->addIncoming(new_call, slowpath); phiNode->addIncoming(v_as_ptr, fastpath); phiNode->takeName(target); - + target->replaceAllUsesWith(phiNode); return; } else { @@ -364,6 +364,13 @@ bool FinalLowerGC::runOnFunction(Function &F) allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped); T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext()); +#ifdef MMTK_GC + writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1); + writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2); + writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow); + writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow); +#endif + // Lower all calls to supported intrinsics. for (auto &BB : F) { for (auto it = BB.begin(); it != BB.end();) { diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index f257afd2c6211..7ce2732e4280b 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2617,7 +2617,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // FIXME: Currently we call write barrier with the src object (parent). // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all. // But for other MMTk plans, we need to be careful. 
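For the generational object barrier discussed above, only the source (parent) object matters: the first write to a parent after it is marked logs that parent into the remembered set, so the fastpath reduces to a single metadata-bit test. Roughly, with hypothetical helper names (MMTk's real fastpath tests the unlogged side-metadata bit):

    static inline void object_barrier(void *parent) {
        if (object_is_unlogged(parent))     // fastpath: one bit test, no call
            object_barrier_slow(parent);    // slowpath: clear the bit, enqueue parent
    }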
-        const bool INLINE_WRITE_BARRIER = true;
+        const bool INLINE_WRITE_BARRIER = false;
         if (CI->getCalledOperand() == write_barrier_func) {
             if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
                 if (INLINE_WRITE_BARRIER) {
diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
index 8a7d95871d7c6..c2b9aff11ac3e 100644
--- a/src/mmtk-gc.c
+++ b/src/mmtk-gc.c
@@ -32,7 +32,23 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre
 JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable)
 {
 }
+JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream)
+{
+}
+
+JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS];
+
+STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT
+{
+    // FIXME: MMTk would have to provide its own stats
+}
+
+#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants
+
+JL_DLLEXPORT uint64_t jl_get_pg_size(void)
+{
+    return MMTK_GC_PAGE_SZ;
+}
 
 inline void maybe_collect(jl_ptls_t ptls)
 {
@@ -271,6 +287,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     mmtk_post_bind_mutator(&ptls->mmtk_mutator, mmtk_mutator);
 }
 
+void jl_free_thread_gc_state(jl_ptls_t ptls)
+{
+}
+
 void jl_deinit_thread_heap(jl_ptls_t ptls)
 {
     mmtk_destroy_mutator(&ptls->mmtk_mutator);
@@ -380,24 +400,26 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
     jl_gcframe_t **pgcstack = jl_get_pgcstack();
     jl_task_t *ct = jl_current_task;
-    if (pgcstack && ct->world_age) {
+    void *data = malloc(sz);
+    if (data != NULL && pgcstack != NULL && ct->world_age) {
         jl_ptls_t ptls = ct->ptls;
         malloc_maybe_collect(ptls, sz);
         jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz);
     }
-    return malloc(sz);
+    return data;
 }
 
 JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
     jl_gcframe_t **pgcstack = jl_get_pgcstack();
     jl_task_t *ct = jl_current_task;
-    if (pgcstack && ct->world_age) {
+    void *data = calloc(nm, sz);
+    if (data != NULL && pgcstack != NULL && ct->world_age) {
         jl_ptls_t ptls = ct->ptls;
         malloc_maybe_collect(ptls, nm * sz);
         jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz);
     }
-    return calloc(nm, sz);
+    return data;
 }
 
 JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
@@ -405,7 +427,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
     jl_gcframe_t **pgcstack = jl_get_pgcstack();
     jl_task_t *ct = jl_current_task;
     free(p);
-    if (pgcstack && ct->world_age) {
+    if (pgcstack != NULL && ct->world_age) {
         jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz);
     }
 }

From 4a17579236fec2d207860b5368311709f860bdcf Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 25 Jul 2024 04:30:56 +0000
Subject: [PATCH 059/116] Fix test in make-Profile

---
 src/mmtk-gc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
index c2b9aff11ac3e..5a104c4856c54 100644
--- a/src/mmtk-gc.c
+++ b/src/mmtk-gc.c
@@ -32,8 +32,16 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre
 JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable)
 {
 }
+
+// mutex for page profile
+uv_mutex_t page_profile_lock;
+
 JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream)
 {
+    uv_mutex_lock(&page_profile_lock);
+    const char *str = "Page profiler is unsupported in MMTk.";
+    ios_write(stream, str, strlen(str));
+    uv_mutex_unlock(&page_profile_lock);
 }
 
 JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS];

From 6fee739cdb3c00599ba4faee49895cc0094e91a3 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 31 Jul 2024 06:38:35 +0000 Subject: [PATCH 060/116] Fixing macro to lower intrinsics properly --- src/llvm-final-gc-lowering.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index ac2e6c385d0a5..f8802a8f62514 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -396,14 +396,24 @@ bool FinalLowerGC::runOnFunction(Function &F) LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame); LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot); LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes); - LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot); LOWER_INTRINSIC(safepoint, lowerSafepoint); +// These lowerings preserve the CI and do not erase them from the parent +#define LOWER_WB_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \ + auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \ + if (INTRINSIC == callee) { \ + LOWER_INTRINSIC_FUNC(CI, F); \ + ++it; \ + continue; \ + } \ + + LOWER_WB_INTRINSIC(queueGCRoot, lowerQueueGCRoot); + #ifdef MMTK_GC - LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1); - LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2); - LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow); - LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow); + LOWER_WB_INTRINSIC(writeBarrier1, lowerWriteBarrier1); + LOWER_WB_INTRINSIC(writeBarrier2, lowerWriteBarrier2); + LOWER_WB_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow); + LOWER_WB_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow); #endif ++it; From 708e4355c2f9397a0012110c04a8bd1d581969e1 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 31 Jul 2024 06:40:12 +0000 Subject: [PATCH 061/116] Remove jl_gc_wb_buf and jl_gc_wb_binding functions --- src/julia_internal.h | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index 493d75f10eebf..530dc3db8e567 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -667,34 +667,6 @@ void jl_gc_add_finalizer_(jl_ptls_t ptls, void *v, void *f) JL_NOTSAFEPOINT; void gc_setmark_buf(jl_ptls_t ptls, void *buf, uint8_t, size_t) JL_NOTSAFEPOINT; -#ifndef MMTK_GC -STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* -{ - jl_gc_wb(bnd, val); -} - -STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* -{ - // if parent is marked and buf is not - if (__unlikely(jl_astaggedvalue(parent)->bits.gc & 1)) { - jl_task_t *ct = jl_current_task; - gc_setmark_buf(ct->ptls, bufptr, 3, minsz); - } -} - -#else // MMTK_GC - -STATIC_INLINE void jl_gc_wb_binding(jl_binding_t *bnd, void *val) JL_NOTSAFEPOINT // val isa jl_value_t* -{ - mmtk_gc_wb(bnd, val); -} - -STATIC_INLINE void jl_gc_wb_buf(void *parent, void *bufptr, size_t minsz) JL_NOTSAFEPOINT // parent isa jl_value_t* -{ - mmtk_gc_wb(parent, (void*)0); -} -#endif // MMTK_GC - JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT; void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT; From f525fef1cdf20579a1404cdfd126ecad8fba6374 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 2 Aug 2024 00:41:29 +0000 Subject: [PATCH 062/116] Pinning generic memory and owners (wip) --- src/genericmemory.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/genericmemory.c b/src/genericmemory.c index 24db8f29f1a12..f3fa3e2d77320 100644 --- 
a/src/genericmemory.c +++ b/src/genericmemory.c @@ -54,6 +54,7 @@ jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t is tot = sizeof(jl_genericmemory_t) + sizeof(void*); } m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tot, mtype); + PTR_PIN(m); if (pooled) { data = (char*)m + JL_SMALL_BYTE_ALIGNMENT; @@ -107,9 +108,11 @@ JL_DLLEXPORT jl_genericmemory_t *jl_string_to_genericmemory(jl_value_t *str) jl_task_t *ct = jl_current_task; int tsz = sizeof(jl_genericmemory_t) + sizeof(void*); jl_genericmemory_t *m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tsz, jl_memory_uint8_type); + PTR_PIN(m); m->length = jl_string_len(str); m->ptr = jl_string_data(str); jl_genericmemory_data_owner_field(m) = str; + PTR_PIN(str); return m; } @@ -160,6 +163,7 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void jl_exceptionf(jl_argumenterror_type, "invalid GenericMemory size: too large for system address width"); int tsz = sizeof(jl_genericmemory_t) + sizeof(void*); m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tsz, mtype); + PTR_PIN(m); m->ptr = data; m->length = nel; jl_genericmemory_data_owner_field(m) = own_buffer ? (jl_value_t*)m : NULL; @@ -249,9 +253,11 @@ JL_DLLEXPORT jl_genericmemory_t *jl_genericmemory_slice(jl_genericmemory_t *mem, } jl_task_t *ct = jl_current_task; jl_genericmemory_t *newmem = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, sizeof(jl_genericmemory_t) + sizeof(void*), dt); + PTR_PIN(newmem); newmem->length = len; newmem->ptr = data; jl_genericmemory_data_owner_field(newmem) = jl_genericmemory_owner(mem); + PTR_PIN(jl_genericmemory_owner(mem)); return newmem; } From 5cf0dae060a45b1d93fe88d3e0028375c3d9e5eb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 2 Aug 2024 00:42:02 +0000 Subject: [PATCH 063/116] Inlining write barrier --- src/llvm-late-gc-lowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 7ce2732e4280b..f257afd2c6211 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2617,7 +2617,7 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) { // FIXME: Currently we call write barrier with the src object (parent). // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all. // But for other MMTk plans, we need to be careful. 
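The PTR_PIN calls added in genericmemory.c above all protect the same invariant: m->ptr is an interior pointer into some owner object's payload, and without introspecting the object during a copy the GC could not fix that pointer up after moving either side. Condensed from the hunks above (schematic, not a verbatim excerpt):

    jl_genericmemory_t *m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tsz, mtype);
    PTR_PIN(m);                                 // m itself holds the raw m->ptr field
    m->ptr = jl_string_data(str);               // interior pointer into str's payload
    jl_genericmemory_data_owner_field(m) = str;
    PTR_PIN(str);                               // the payload must not move under m->ptr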
- const bool INLINE_WRITE_BARRIER = false; + const bool INLINE_WRITE_BARRIER = true; if (CI->getCalledOperand() == write_barrier_func) { if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { if (INLINE_WRITE_BARRIER) { From ec77b2b0e725e35733e81f71b281bb18dc9293b6 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 2 Aug 2024 03:33:09 +0000 Subject: [PATCH 064/116] Adding wb on array copying; undef new macro as well --- src/genericmemory.c | 1 + src/llvm-final-gc-lowering.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/genericmemory.c b/src/genericmemory.c index f3fa3e2d77320..d98c8302d3573 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -273,6 +273,7 @@ JL_DLLEXPORT void jl_genericmemory_copyto(jl_genericmemory_t *dest, char* destda _Atomic(void*) * dest_p = (_Atomic(void*)*)destdata; _Atomic(void*) * src_p = (_Atomic(void*)*)srcdata; jl_value_t *owner = jl_genericmemory_owner(dest); + mmtk_gc_wb(owner, NULL); if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) { jl_value_t *src_owner = jl_genericmemory_owner(src); ssize_t done = 0; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index f8802a8f62514..401ad983f8c81 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -418,6 +418,7 @@ bool FinalLowerGC::runOnFunction(Function &F) ++it; #undef LOWER_INTRINSIC +#undef LOWER_WB_INTRINSIC } } From 7cc64d592612ec61259e0ed93f7ba64f22277f95 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 2 Aug 2024 05:10:54 +0000 Subject: [PATCH 065/116] Enabled inlined fastpath allocation again --- src/llvm-final-gc-lowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 401ad983f8c81..9090460662c73 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -247,7 +247,7 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. - const bool INLINE_FASTPATH_ALLOCATION = false; + const bool INLINE_FASTPATH_ALLOCATION = true; if (INLINE_FASTPATH_ALLOCATION) { // Assuming we use the first immix allocator. 
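The mmtk_gc_wb(owner, NULL) call added in jl_genericmemory_copyto above is needed because the bulk copy stores pointer slots directly, bypassing the per-store write barrier; for an object-remembering barrier, logging the destination owner once before the loop suffices. In outline (a sketch of the intent):

    mmtk_gc_wb(owner, NULL);       // object barrier: remember the destination owner once
    for (ssize_t i = 0; i < n; i++)
        dest_p[i] = src_p[i];      // raw slot stores, no per-slot barrier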
From 9dc444834e093cd26737f4b0cbf1211400a8701c Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 6 Aug 2024 11:25:59 +0000 Subject: [PATCH 066/116] Enabling fastpath allocation --- src/llvm-final-gc-lowering.cpp | 139 +++++---------------------------- src/llvm-late-gc-lowering.cpp | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 118 deletions(-) diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 9090460662c73..8bfb5e3b32a5e 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -103,6 +103,7 @@ void FinalLowerGC::lowerNewGCFrame(CallInst *target, Function &F) builder.CreateMemSet(gcframe, Constant::getNullValue(Type::getInt8Ty(F.getContext())), ptrsize * (nRoots + 2), Align(16), tbaa_gcframe); target->replaceAllUsesWith(gcframe); + target->eraseFromParent(); } void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F) @@ -130,6 +131,7 @@ void FinalLowerGC::lowerPushGCFrame(CallInst *target, Function &F) gcframe, pgcstack, Align(sizeof(void*))); + target->eraseFromParent(); } void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F) @@ -148,6 +150,7 @@ void FinalLowerGC::lowerPopGCFrame(CallInst *target, Function &F) pgcstack, Align(sizeof(void*))); inst->setMetadata(LLVMContext::MD_tbaa, tbaa_gcframe); + target->eraseFromParent(); } void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F) @@ -167,6 +170,7 @@ void FinalLowerGC::lowerGetGCFrameSlot(CallInst *target, Function &F) auto gep = builder.CreateInBoundsGEP(T_prjlvalue, gcframe, index); gep->takeName(target); target->replaceAllUsesWith(gep); + target->eraseFromParent(); } void FinalLowerGC::lowerQueueGCRoot(CallInst *target, Function &F) @@ -183,6 +187,7 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) IRBuilder<> builder(target); Value* signal_page = target->getOperand(0); builder.CreateLoad(T_size, signal_page, true); + target->eraseFromParent(); } #ifdef MMTK_GC @@ -209,7 +214,6 @@ void FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F) assert(target->arg_size() == 2); target->setCalledFunction(writeBarrier2SlowFunc); } - #endif void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) @@ -235,97 +239,11 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) derefBytes = sz; } else { - #ifndef MMTK_GC auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), offset); auto pool_osize = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize, type }); if (sz > 0) derefBytes = sz; - #else // MMTK_GC - auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); - auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); - - // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. - // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. - const bool INLINE_FASTPATH_ALLOCATION = true; - - if (INLINE_FASTPATH_ALLOCATION) { - // Assuming we use the first immix allocator. - // FIXME: We should get the allocator index and type from MMTk. 
-            auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
-
-            auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
-            auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
-
-            auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
-            auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
-            auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
-
-            // offset = 8
-            auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
-            auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
-            auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
-            // alignment 16 (15 = 16 - 1)
-            auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
-            auto result = builder.CreateNSWAdd(cursor, delta, "result");
-
-            auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
-
-            auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
-            auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
-            auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
-
-            auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
-
-            auto current_block = target->getParent();
-            builder.SetInsertPoint(target->getNextNode());
-            auto phiNode = builder.CreatePHI(poolAllocFunc->getReturnType(), 2, "phi_fast_slow");
-            auto top_cont = current_block->splitBasicBlock(target->getNextNode(), "top_cont");
-
-            auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
-            auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction(), top_cont);
-
-            auto next_br = current_block->getTerminator();
-            next_br->eraseFromParent();
-            builder.SetInsertPoint(current_block);
-            builder.CreateCondBr(gt_limit, slowpath, fastpath);
-
-            // slowpath
-            builder.SetInsertPoint(slowpath);
-            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-            auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-            new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
-            builder.CreateBr(top_cont);
-
-            // fastpath
-            builder.SetInsertPoint(fastpath);
-            builder.CreateStore(new_cursor, cursor_ptr);
-
-            // ptls->gc_num.allocd += osize;
-            auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num));
-            auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-            auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-            auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-            auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-            builder.CreateStore(pool_allocd_total, pool_alloc_tls);
-
-            auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
-            auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());
-            builder.CreateBr(top_cont);
-
-            phiNode->addIncoming(new_call, slowpath);
-            phiNode->addIncoming(v_as_ptr, fastpath);
-            phiNode->takeName(target);
-
-            target->replaceAllUsesWith(phiNode);
-            return;
-        } else {
-            auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
-            newI = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
-            if (sz > 0)
-                derefBytes = sz;
-        }
-        #endif // MMTK_GC
         }
     }
     else {
         auto size = builder.CreateZExtOrTrunc(target->getArgOperand(1), T_size);
@@ -333,7 +251,6 @@ void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F)
         newI = builder.CreateCall(allocTypedFunc, { ptls, size, type });
         derefBytes = sizeof(void*);
     }
-
     newI->setAttributes(newI->getCalledFunction()->getAttributes());
     unsigned align = std::max((unsigned)target->getRetAlign().valueOrOne().value(), (unsigned)sizeof(void*));
     newI->addRetAttr(Attribute::getWithAlignment(F.getContext(), Align(align)));
@@ -341,6 +258,7 @@
         newI->addDereferenceableRetAttr(derefBytes);
     newI->takeName(target);
     target->replaceAllUsesWith(newI);
+    target->eraseFromParent();
 }
 
 bool FinalLowerGC::runOnFunction(Function &F)
@@ -362,63 +280,48 @@ bool FinalLowerGC::runOnFunction(Function &F)
     poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc);
     bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc);
     allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped);
-    T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
-
 #ifdef MMTK_GC
     writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1);
     writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2);
     writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow);
     writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow);
 #endif
+    T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
 
     // Lower all calls to supported intrinsics.
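// [Editorial note] The rewrite below switches the scan loop to
// llvm::make_early_inc_range, which advances the iterator before each body
// runs, so a lowering helper may erase the call it is visiting without
// invalidating iteration. A standalone sketch of the pattern (isLowerable and
// lowerAndErase are hypothetical stand-ins):
//
//     for (Instruction &I : llvm::make_early_inc_range(BB)) {
//         if (auto *CI = dyn_cast<CallInst>(&I))
//             if (isLowerable(CI))   // hypothetical predicate
//                 lowerAndErase(CI); // may call CI->eraseFromParent() safely
//     }
//
// This is what lets each lowerXYZ helper above take over the
// target->eraseFromParent() call instead of the loop doing it.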
     for (auto &BB : F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto *CI = dyn_cast<CallInst>(&*it);
-            if (!CI) {
-                ++it;
+        for (auto &I : make_early_inc_range(BB)) {
+            auto *CI = dyn_cast<CallInst>(&I);
+            if (!CI)
                 continue;
-            }
 
             Value *callee = CI->getCalledOperand();
             assert(callee);
 
 #define LOWER_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \
-            auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \
-            if (INTRINSIC == callee) { \
-                LOWER_INTRINSIC_FUNC(CI, F); \
-                it = CI->eraseFromParent(); \
-                continue; \
-            } \
+            do { \
+                auto intrinsic = getOrNull(jl_intrinsics::INTRINSIC); \
+                if (intrinsic == callee) { \
+                    LOWER_INTRINSIC_FUNC(CI, F); \
+                } \
+            } while (0)
 
             LOWER_INTRINSIC(newGCFrame, lowerNewGCFrame);
             LOWER_INTRINSIC(pushGCFrame, lowerPushGCFrame);
             LOWER_INTRINSIC(popGCFrame, lowerPopGCFrame);
             LOWER_INTRINSIC(getGCFrameSlot, lowerGetGCFrameSlot);
             LOWER_INTRINSIC(GCAllocBytes, lowerGCAllocBytes);
+            LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
             LOWER_INTRINSIC(safepoint, lowerSafepoint);
 
-// These lowerings preserve the CI and do not erase them from the parent
-#define LOWER_WB_INTRINSIC(INTRINSIC, LOWER_INTRINSIC_FUNC) \
-            auto INTRINSIC = getOrNull(jl_intrinsics::INTRINSIC); \
-            if (INTRINSIC == callee) { \
-                LOWER_INTRINSIC_FUNC(CI, F); \
-                ++it; \
-                continue; \
-            } \
-
-            LOWER_WB_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
-
 #ifdef MMTK_GC
-            LOWER_WB_INTRINSIC(writeBarrier1, lowerWriteBarrier1);
-            LOWER_WB_INTRINSIC(writeBarrier2, lowerWriteBarrier2);
-            LOWER_WB_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow);
-            LOWER_WB_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow);
+            LOWER_INTRINSIC(writeBarrier1, lowerNewGCFrame);
+            LOWER_INTRINSIC(writeBarrier2, lowerNewGCFrame);
+            LOWER_INTRINSIC(writeBarrier1Slow, lowerNewGCFrame);
+            LOWER_INTRINSIC(writeBarrier2Slow, lowerNewGCFrame);
 #endif
 
-            ++it;
 #undef LOWER_INTRINSIC
-#undef LOWER_WB_INTRINSIC
         }
     }

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index f257afd2c6211..eb63e1196e8ae 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -323,6 +324,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
 
 private:
     CallInst *pgcstack;
+    Function *poolAllocFunc;
 
     void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef<int> &SafepointsSoFar,
                       SmallVector<int, 0> &&RefinedPtr = SmallVector<int, 0>());
@@ -359,6 +361,10 @@ struct LateLowerGCFrame: private JuliaPassContext {
     void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef<int> CalleeRoots);
     Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V);
     Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
+
+#ifdef MMTK_GC
+    Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
+#endif
 };
 
 static unsigned getValueAddrSpace(Value *V) {
@@ -2880,8 +2886,118 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    assert(target->arg_size() == 3);
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    if (auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1))) {
+        size_t sz = (size_t)CI->getZExtValue();
+        // This is strongly architecture and OS dependent
+        int osize;
+        int offset = jl_gc_classify_pools(sz, &osize);
+        if (offset >= 0) {
+            // In this case julia.gc_alloc_bytes will simply become a call to jl_gc_pool_alloc in the final GC lowering pass
+            auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize);
+            auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize);
+
+            // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
+            // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
+            const bool INLINE_FASTPATH_ALLOCATION = true;
+
+            if (INLINE_FASTPATH_ALLOCATION) {
+                // Assuming we use the first immix allocator.
+                // FIXME: We should get the allocator index and type from MMTk.
+                auto allocator_offset = offsetof(jl_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+                auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor));
+                auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit));
+
+                auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos);
+                auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr");
+                auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor");
+
+                // offset = 8
+                auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8));
+                auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor);
+                auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor);
+                // alignment 16 (15 = 16 - 1)
+                auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta");
+                auto result = builder.CreateNSWAdd(cursor, delta, "result");
+
+                auto new_cursor = builder.CreateNSWAdd(result, pool_osize);
+
+                auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos);
+                auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr");
+                auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit");
+
+                auto gt_limit = builder.CreateICmpSGT(new_cursor, limit);
+
+                auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction());
+                auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
+
+                auto next_instr = target->getNextNode();
+                DomTreeUpdater dtu = DomTreeUpdater(&GetDT(), llvm::DomTreeUpdater::UpdateStrategy::Lazy);
+                MDBuilder MDB(F.getContext());
+                SmallVector<uint32_t, 2> Weights{1, 9};
+                SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights), &dtu);
+
+                builder.SetInsertPoint(next_instr);
+                auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow");
+
+                // slowpath
+                builder.SetInsertPoint(slowpath);
+                auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+                auto new_call = builder.CreateCall(poolAllocFunc, { ptls, pool_offs, pool_osize_i32, type });
+                new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+                builder.CreateBr(next_instr->getParent());
+
+                // fastpath
+                builder.SetInsertPoint(fastpath);
+                builder.CreateStore(new_cursor, cursor_ptr);
+
+                // ptls->gc_num.allocd += osize;
+                // auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num));
+                // auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+                // auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+                // auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+                // auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+                // builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+                auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
+                auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());
+                builder.CreateBr(next_instr->getParent());
+
+                phiNode->addIncoming(new_call, slowpath);
+                phiNode->addIncoming(v_as_ptr, fastpath);
+                phiNode->takeName(target);
+                return phiNode;
+            }
+        }
+    }
+    return target;
+}
+
+template <typename TIterator>
+static void replaceInstruction(
+    Instruction *oldInstruction,
+    Value *newInstruction,
+    TIterator &it)
+{
+    if (newInstruction != oldInstruction) {
+        oldInstruction->replaceAllUsesWith(newInstruction);
+        it = oldInstruction->eraseFromParent();
+    }
+    else {
+        ++it;
+    }
+}
+
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
+    poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc);
     LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n");
     if (!pgcstack_getter && !adoptthread_func)
         return CleanupIR(F, nullptr, CFGModified);
@@ -2896,6 +3012,29 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     std::map<Value *, std::pair<int, int>> CallFrames; // = OptimizeCallFrames(S, Ordering);
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
+
+#ifdef MMTK_GC
+    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
+    for (BasicBlock &BB : F) {
+        for (auto it = BB.begin(); it != BB.end();) {
+            auto *CI = dyn_cast<CallInst>(&*it);
+            if (!CI) {
+                ++it;
+                continue;
+            }
+
+            Value *callee = CI->getCalledOperand();
+            assert(callee);
+
+            auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
+            if (GCAllocBytes == callee) {
+                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
+                continue;
+            }
+            ++it;
+        }
+    }
+#endif
     return true;
 }

From df66882e342deac11ac7c824c0452edfc0c8a375 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 7 Aug 2024 02:24:01 +0000
Subject: [PATCH 067/116] Fixing typos when lowering write barriers

---
 src/llvm-final-gc-lowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp
index 8bfb5e3b32a5e..dd4dd05a89101 100644
--- a/src/llvm-final-gc-lowering.cpp
+++ b/src/llvm-final-gc-lowering.cpp
@@ -315,10 +315,10 @@ bool FinalLowerGC::runOnFunction(Function &F)
             LOWER_INTRINSIC(safepoint, lowerSafepoint);
 
 #ifdef MMTK_GC
-            LOWER_INTRINSIC(writeBarrier1, lowerNewGCFrame);
-            LOWER_INTRINSIC(writeBarrier2, lowerNewGCFrame);
-            LOWER_INTRINSIC(writeBarrier1Slow, lowerNewGCFrame);
-            LOWER_INTRINSIC(writeBarrier2Slow, lowerNewGCFrame);
+            LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1);
+            LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2);
+            LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow);
+            LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow);
 #endif
 
 #undef LOWER_INTRINSIC

From 8ef0c1547ea49c5fe4033ccf2362bd2932c2a226 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 7 Aug 2024 03:32:28 +0000
Subject: [PATCH 068/116] Updating fastpath allocation to count number of alloced bytes

---
 src/llvm-late-gc-lowering.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index eb63e1196e8ae..2630afede0f7b 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2959,12 +2959,12 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
                 builder.CreateStore(new_cursor, cursor_ptr);
 
                 // ptls->gc_num.allocd += osize;
-                // auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_num));
-                // auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
-                // auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
-                // auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
-                // auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
-                // builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+                auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num));
+                auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos);
+                auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc");
+                auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls);
+                auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+                builder.CreateStore(pool_allocd_total, pool_alloc_tls);
 
                 auto v_raw = builder.CreateNSWAdd(result, ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t)));
                 auto v_as_ptr = builder.CreateIntToPtr(v_raw, poolAllocFunc->getReturnType());

From b56b167845583bc9adbbffee64741bb9ffab80bd Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 7 Aug 2024 05:00:56 +0000
Subject: [PATCH 069/116] Updating write barrier block splitting

---
 src/llvm-late-gc-lowering.cpp | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 2630afede0f7b..5923214a47652 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -363,7 +363,7 @@ struct LateLowerGCFrame: private JuliaPassContext {
     Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V);
 
 #ifdef MMTK_GC
-    Value* lowerGCAllocBytesLate(CallInst *target, Function &F);
+    Value* lowerGCAllocBytesLate(CallInst *target, Function &F, State &S);
 #endif
 };
 
@@ -2655,7 +2655,11 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
                     // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
                     MDBuilder MDB(F.getContext());
                     SmallVector<uint32_t, 2> Weights{1, 9};
-                    auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights));
+                    if (!S->DT) {
+                        S->DT = &GetDT();
+                    }
+                    DomTreeUpdater dtu = DomTreeUpdater(S->DT, llvm::DomTreeUpdater::UpdateStrategy::Lazy);
+                    auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights), &dtu);
                     builder.SetInsertPoint(mayTriggerSlowpath);
                     builder.CreateCall(getOrDeclare(jl_intrinsics::writeBarrier1Slow), { parent });
                 } else {
@@ -2886,7 +2890,7 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
-Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F, State &S)
 {
     assert(target->arg_size() == 3);
 
@@ -2939,7 +2943,10 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
                 auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction());
 
                 auto next_instr = target->getNextNode();
-                DomTreeUpdater dtu = DomTreeUpdater(&GetDT(), llvm::DomTreeUpdater::UpdateStrategy::Lazy);
+                if (!S.DT) {
+                    S.DT = &GetDT();
+                }
+                DomTreeUpdater dtu = DomTreeUpdater(S.DT, llvm::DomTreeUpdater::UpdateStrategy::Lazy);
                 MDBuilder MDB(F.getContext());
                 SmallVector<uint32_t, 2> Weights{1, 9};
                 SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights), &dtu);
@@ -3028,7 +3035,7 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
 
             auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes);
             if (GCAllocBytes == callee) {
-                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it);
+                replaceInstruction(CI, lowerGCAllocBytesLate(CI, F, S), it);
                 continue;
             }
             ++it;

From 5bbfd16ef5e10c074a5c5739801a47f459932500 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 15 Aug 2024 05:55:12 +0000
Subject: [PATCH 070/116] Only pin owners, not all generic memory objects

---
 src/genericmemory.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/genericmemory.c b/src/genericmemory.c
index d98c8302d3573..6851e9131e534 100644
--- a/src/genericmemory.c
+++ b/src/genericmemory.c
@@ -54,7 +54,6 @@ jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t is
         tot = sizeof(jl_genericmemory_t) + sizeof(void*);
     }
     m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tot, mtype);
-    PTR_PIN(m);
     if (pooled) {
         data = (char*)m + JL_SMALL_BYTE_ALIGNMENT;
@@ -108,7 +107,6 @@ JL_DLLEXPORT jl_genericmemory_t *jl_string_to_genericmemory(jl_value_t *str)
     jl_task_t *ct = jl_current_task;
     int tsz = sizeof(jl_genericmemory_t) + sizeof(void*);
     jl_genericmemory_t *m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tsz, jl_memory_uint8_type);
-    PTR_PIN(m);
     m->length = jl_string_len(str);
     m->ptr = jl_string_data(str);
     jl_genericmemory_data_owner_field(m) = str;
@@ -163,11 +161,11 @@ JL_DLLEXPORT jl_genericmemory_t *jl_ptr_to_genericmemory(jl_value_t *mtype, void
         jl_exceptionf(jl_argumenterror_type, "invalid GenericMemory size: too large for system address width");
     int tsz = sizeof(jl_genericmemory_t) + sizeof(void*);
     m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tsz, mtype);
-    PTR_PIN(m);
     m->ptr = data;
     m->length = nel;
     jl_genericmemory_data_owner_field(m) = own_buffer ? (jl_value_t*)m : NULL;
     if (own_buffer) {
+        PTR_PIN(m);
         int isaligned = 0;  // TODO: allow passing memalign'd buffers
         jl_gc_track_malloced_genericmemory(ct->ptls, m, isaligned);
         jl_gc_count_allocd(nel*elsz);

From df35d17559c67c7fc61ca683ea10fc80addb71bb Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 28 Aug 2024 00:44:09 +0000
Subject: [PATCH 071/116] Refactoring to be considered before adding MMTk

---
 src/gc-common.c      | 156 +++++++++++++++++++++++++++++++++++++++++++
 src/gc-common.h      |   6 ++
 src/gc-debug.c       |  41 +----------
 src/gc-interface.h   |  12 ++++
 src/gc-stacks.c      |   4 +-
 src/gc-stock.c       | 156 ++++++++++++-------------------------------
 src/gc-stock.h       |  21 ------
 src/julia.h          |   2 +-
 src/julia_internal.h |  26 +------
 src/scheduler.c      |  11 +++
 src/stackwalk.c      |   4 +-
 src/staticdata.c     |   2 +
 12 files changed, 237 insertions(+), 204 deletions(-)

diff --git a/src/gc-common.c b/src/gc-common.c
index ee461b576ea9e..2ec167caa667a 100644
--- a/src/gc-common.c
+++ b/src/gc-common.c
@@ -20,6 +20,11 @@ extern "C" {
 
 jl_gc_num_t gc_num = {0};
 
+JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
+{
+    return gc_num.total_time;
+}
+
 // =========================================================================== //
 // GC Callbacks
 // =========================================================================== //
@@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states;
 // MISC
 // =========================================================================== //
 
+JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_new_weakref_th(ptls, value);
+}
+
+JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty)
+{
+    return jl_gc_alloc(ptls, sz, ty);
+}
+
+JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return jl_gc_alloc(ptls, sz, NULL);
+}
+
+// allocation wrappers that save the size of allocations, to allow using
+// jl_gc_counted_* functions with a libc-compatible API.
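/* [Editorial note] Layout produced by the wrappers added below (assuming
   JL_SMALL_BYTE_ALIGNMENT == 16 and an 8-byte int64_t):

       base           base+8         base+16
       | size (i64)   | padding      | user data ...
       ^ jl_gc_counted_malloc result  ^ pointer returned to the caller (p + 2)

   jl_free recovers the base pointer with (int64_t*)p - 2 and reads the saved
   size from slot 0, so the counted-free call can report the exact original
   allocation size back to the GC's accounting. */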
+ +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
+typedef struct _mallocmemory_t { + jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory + struct _mallocmemory_t *next; +} mallocmemory_t; + #if defined(_OS_WINDOWS_) STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) { diff --git a/src/gc-debug.c b/src/gc-debug.c index 19dd93af5f236..d05fb4b49e9f7 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,46 +1105,7 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - -static int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} +extern int gc_logging_enabled; void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { diff --git a/src/gc-interface.h b/src/gc-interface.h index e543b4b5879f1..682f22344d69d 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void); // Allocation // ========================================================================= // +// On GCC, this function is inlined when sz is constant (see julia_internal.h) +// In general, this function should implement allocation and should use the specific GC's logic +// to decide whether to allocate a small or a large object. Finally, note that this function +// **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record +// an allocation of that type in the allocation profiler. +struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty); + // Allocates small objects and increments Julia allocation counterst. Size of the object // header must be included in the object size. The (possibly unused in some implementations) // offset to the arena in which we're allocating is passed in the second parameter, and the @@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belonged to the old generation in nursery collections. 
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index d25f8917f302d..4a8c6fe7decc5 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -553,24 +553,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -647,17 +629,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -816,6 +787,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2792,6 +2786,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2830,11 +2839,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3386,13 +3390,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3674,63 +3671,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3864,18 +3804,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4003,14 +3931,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 45c93bf4289ae..3f3900b349bcf 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index abb8a57ff13b0..db57db1fbeb38 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index f00667d016796..edddb68754fc3 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1074,7 +1050,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 363aa46b62221..e07a5365bf06f 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -654,6 +654,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3899,6 +3900,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From d2f2b8d9c477514e93009d0b99e2ffe65bcc9831 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 072/116] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 4a8c6fe7decc5..9b633cacd7870 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3936,11 +3936,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index e07a5365bf06f..363aa46b62221 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -654,7 +654,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -3900,7 +3899,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From a42cb6410cf4f3e1773b0e41ecb5c696bc9cf836 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 073/116] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 9b633cacd7870..61fc8d4e83a3a 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2801,36 +2801,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } 
-    }
-    else if (prev && !on) {
-        // enable -> disable
-        jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
-        // check if the GC is running and wait for it to finish
-        jl_gc_safepoint_(ptls);
-    }
-    return prev;
-}
-
-JL_DLLEXPORT int jl_gc_is_enabled(void)
-{
-    jl_ptls_t ptls = jl_current_task->ptls;
-    return !ptls->disable_gc;
-}
-
 JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT
 {
     jl_gc_num_t num = gc_num;

From 92563918292056178d6f6ed12c58a9f998ef2d54 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Mon, 16 Sep 2024 06:38:02 +0000
Subject: [PATCH 074/116] Addressing PR comments

---
 src/gc-common.c      | 134 +++++++++++++++++++++++++------------
 src/gc-common.h      |   6 ++
 src/gc-debug.c       |   2 -
 src/gc-interface.h   |  30 +---------
 src/gc-stock.c       |  18 +-----
 src/gc-stock.h       |  15 +++++
 src/julia.h          |   2 +-
 src/julia_internal.h |   4 +-
 src/stackwalk.c      |  10 +---
 9 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/src/gc-common.c b/src/gc-common.c
index 03c046bc300f2..046feae6aa4c5 100644
--- a/src/gc-common.c
+++ b/src/gc-common.c
@@ -491,15 +491,9 @@ int gc_n_threads;
 jl_ptls_t* gc_all_tls_states;
 
 // =========================================================================== //
-// MISC
+// Allocation
 // =========================================================================== //
 
-JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value)
-{
-    jl_ptls_t ptls = jl_current_task->ptls;
-    return jl_gc_new_weakref_th(ptls, value);
-}
-
 JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty)
 {
     return jl_gc_alloc(ptls, sz, ty);
@@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty)
     return jl_gc_alloc_(ptls, sz, ty);
 }
 
-const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00
-JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT
-{
-    return jl_buff_tag;
-}
-
-// callback for passing OOM errors from gmp
-JL_DLLEXPORT void jl_throw_out_of_memory_error(void)
-{
-    jl_throw(jl_memory_exception);
-}
+// =========================================================================== //
+// Generic Memory
+// =========================================================================== //
 
 size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT
 {
@@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i
     ptls->gc_tls.heap.mallocarrays = ma;
 }
 
+// =========================================================================== //
+// GC Debug
+// =========================================================================== //
+
+int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT
+{
+    int nf = (int)jl_datatype_nfields(vt);
+    for (int i = 1; i < nf; i++) {
+        if (slot < (void*)((char*)obj + jl_field_offset(vt, i)))
+            return i - 1;
+    }
+    return nf - 1;
+}
+
+int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
+{
+    char *slot = (char*)_slot;
+    jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj);
+    char *start = NULL;
+    size_t len = 0;
+    size_t elsize = sizeof(void*);
+    if (vt == jl_module_type) {
+        jl_module_t *m = (jl_module_t*)obj;
+        start = (char*)m->usings.items;
+        len = m->usings.len;
+    }
+    else if (vt == jl_simplevector_type) {
+        start = (char*)jl_svec_data(obj);
+        len = jl_svec_len(obj);
+    }
+    if (slot < start || slot >= start + elsize * len)
+        return -1;
+    return (slot - start) / elsize;
+}
+
+// =========================================================================== //
+// GC Control
+// =========================================================================== //
+
+JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) {
+    return jl_atomic_load_acquire(&jl_gc_disable_counter);
+}
+
+JL_DLLEXPORT int jl_gc_is_enabled(void)
+{
+    jl_ptls_t ptls = jl_current_task->ptls;
+    return !ptls->disable_gc;
+}
+
+int gc_logging_enabled = 0;
+
+JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
+    gc_logging_enabled = enable;
+}
+
+JL_DLLEXPORT int jl_is_gc_logging_enabled(void) {
+    return gc_logging_enabled;
+}
+
 // collector entry point and control
 _Atomic(uint32_t) jl_gc_disable_counter = 1;
 
@@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on)
     return prev;
 }
 
-JL_DLLEXPORT int jl_gc_is_enabled(void)
+// =========================================================================== //
+// MISC
+// =========================================================================== //
+
+JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value)
 {
     jl_ptls_t ptls = jl_current_task->ptls;
-    return !ptls->disable_gc;
-}
-
-int gc_logging_enabled = 0;
-
-JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
-    gc_logging_enabled = enable;
+    return jl_gc_new_weakref_th(ptls, value);
 }
 
-JL_DLLEXPORT int jl_is_gc_logging_enabled(void) {
-    return gc_logging_enabled;
+JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) {
+    return ijl_small_typeof;
 }
 
-// gc-debug common functions
-// ---
-
-int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT
+const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00
+JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT
 {
-    int nf = (int)jl_datatype_nfields(vt);
-    for (int i = 1; i < nf; i++) {
-        if (slot < (void*)((char*)obj + jl_field_offset(vt, i)))
-            return i - 1;
-    }
-    return nf - 1;
+    return jl_buff_tag;
 }
 
-int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
+// callback for passing OOM errors from gmp
+JL_DLLEXPORT void jl_throw_out_of_memory_error(void)
 {
-    char *slot = (char*)_slot;
-    jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj);
-    char *start = NULL;
-    size_t len = 0;
-    size_t elsize = sizeof(void*);
-    if (vt == jl_module_type) {
-        jl_module_t *m = (jl_module_t*)obj;
-        start = (char*)m->usings.items;
-        len = m->usings.len;
-    }
-    else if (vt == jl_simplevector_type) {
-        start = (char*)jl_svec_data(obj);
-        len = jl_svec_len(obj);
-    }
-    if (slot < start || slot >= start + elsize * len)
-        return -1;
-    return (slot - start) / elsize;
+    jl_throw(jl_memory_exception);
 }
 
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gc-common.h b/src/gc-common.h
index 154b9659e9ccb..32b7470b13a58 100644
--- a/src/gc-common.h
+++ b/src/gc-common.h
@@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o);
 extern int gc_n_threads;
 extern jl_ptls_t* gc_all_tls_states;
 
+// =========================================================================== //
+// Logging
+// =========================================================================== //
+
+extern int gc_logging_enabled;
+
 #endif // JL_GC_COMMON_H
diff --git a/src/gc-debug.c b/src/gc-debug.c
index d05fb4b49e9f7..7c479484cde45 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1105,8 +1105,6 @@ void gc_count_pool(void)
     jl_safe_printf("************************\n");
 }
 
-extern int gc_logging_enabled;
-
 void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 61fc8d4e83a3a..3ff37566dc6c7 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2786,19 +2786,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3182,8 +3171,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index 3f3900b349bcf..50eca3aadbd86 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index db57db1fbeb38..abb8a57ff13b0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index edddb68754fc3..e677f40907dfd 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1050,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From ec398e1a98cf713a77f908a459ed37fd4b25af27 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 075/116] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From 68e5e11a229f253ec6de966a321bd9d3de453a3b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 076/116] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
c23f0db8347f475e1eb2b37261dd4816537210fa Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 077/116] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From 4bfcfe5df056bb5066a545e29c29463722678892 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 27 Aug 2024 06:47:41 +0000 Subject: [PATCH 078/116] WIP: Adding support for MMTk/Immix --- Make.inc | 47 ++ contrib/refresh_checksums.mk | 2 +- src/Makefile | 43 +- src/builtins.c | 1 + src/gc-common.c | 70 +++ src/gc-debug.c | 4 +- src/gc-heap-snapshot.cpp | 1 - src/gc-interface.h | 3 + src/gc-mmtk.c | 843 +++++++++++++++++++++++++++++++++++ src/gc-mmtk.h | 34 ++ src/gc-page-profiler.c | 4 +- src/gc-pages.c | 4 +- src/gc-stock.c | 14 +- src/gc-stock.h | 18 +- src/gc-tls-mmtk.h | 49 ++ src/gc-tls.h | 4 + src/julia.h | 2 +- src/julia_internal.h | 2 +- src/julia_threads.h | 4 + src/stackwalk.c | 2 + src/staticdata.c | 2 + src/threading.c | 4 + 22 files changed, 1123 insertions(+), 34 deletions(-) create mode 100644 src/gc-mmtk.c create mode 100644 src/gc-mmtk.h create mode 100644 src/gc-tls-mmtk.h diff --git a/Make.inc b/Make.inc index f078a0c84f806..039755ce34098 100644 --- a/Make.inc +++ b/Make.inc @@ -86,6 +86,9 @@ HAVE_SSP := 0 WITH_GC_VERIFY := 0 WITH_GC_DEBUG_ENV := 0 +# Use MMTk GC +WITH_MMTK ?= 0 + # Enable DTrace support WITH_DTRACE := 0 @@ -790,6 +793,44 @@ JCXXFLAGS += -DGC_DEBUG_ENV JCFLAGS += -DGC_DEBUG_ENV endif +ifeq ($(WITH_MMTK), 1) +ifeq (${MMTK_JULIA_DIR},) +$(error MMTK_JULIA_DIR must be set to use MMTk) +endif +JCXXFLAGS += -DMMTK_GC +JCFLAGS += -DMMTK_GC +ifeq (${MMTK_BUILD},) +ifeq (debug,$(findstring debug,$(MAKECMDGOALS))) +MMTK_BUILD = debug +else +MMTK_BUILD = release +endif +endif +ifeq (${MMTK_PLAN},Immix) +JCXXFLAGS += -DMMTK_PLAN_IMMIX +JCFLAGS += -DMMTK_PLAN_IMMIX +endif +ifeq (${MMTK_PLAN},StickyImmix) +JCXXFLAGS += -DMMTK_PLAN_STICKYIMMIX +JCFLAGS += -DMMTK_PLAN_STICKYIMMIX +endif +MMTK_DIR = ${MMTK_JULIA_DIR}/mmtk +MMTK_API_INC = $(MMTK_DIR)/api +MMTK_JULIA_INC = ${MMTK_JULIA_DIR}/julia +ifeq ($(OS),Linux) +MMTK_LIB_NAME := libmmtk_julia.so +else +$(error "Unsupported OS for MMTk") +endif +MMTK_LIB_SRC := $(MMTK_DIR)/target/$(MMTK_BUILD)/$(MMTK_LIB_NAME) +MMTK_LIB_DST := $(BUILDROOT)/usr/lib/$(MMTK_LIB_NAME) +MMTK_LIB := -lmmtk_julia +LDFLAGS += -Wl,-rpath=$(MMTK_DIR)/target/$(MMTK_BUILD)/ +else +MMTK_JULIA_INC := +MMTK_LIB := +endif + ifeq ($(WITH_DTRACE), 1) JCXXFLAGS += -DUSE_DTRACE JCFLAGS += -DUSE_DTRACE @@ -1777,6 +1818,9 @@ PRINT_PERL = printf ' %b %b\n' $(PERLCOLOR)PERL$(ENDCOLOR) $(BINCOLOR)$(GOAL) PRINT_FLISP = printf ' %b %b\n' $(FLISPCOLOR)FLISP$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_JULIA = printf ' %b %b\n' $(JULIACOLOR)JULIA$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) PRINT_DTRACE = printf ' %b %b\n' $(DTRACECOLOR)DTRACE$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = 
printf ' %b %b\n' $(LINKCOLOR)MMTK$(ENDCOLOR) $(BINCOLOR)$(GOAL)$(ENDCOLOR); $(1) +endif else QUIET_MAKE = @@ -1787,6 +1831,9 @@ PRINT_PERL = echo '$(subst ','\'',$(1))'; $(1) PRINT_FLISP = echo '$(subst ','\'',$(1))'; $(1) PRINT_JULIA = echo '$(subst ','\'',$(1))'; $(1) PRINT_DTRACE = echo '$(subst ','\'',$(1))'; $(1) +ifeq ($(WITH_MMTK), 1) +PRINT_MMTK = echo '$(subst ','\'',$(1))'; $(1) +endif endif diff --git a/contrib/refresh_checksums.mk b/contrib/refresh_checksums.mk index f67088141ccd4..bf99c0fad9da2 100644 --- a/contrib/refresh_checksums.mk +++ b/contrib/refresh_checksums.mk @@ -24,7 +24,7 @@ CLANG_TRIPLETS=$(filter %-darwin %-freebsd,$(TRIPLETS)) NON_CLANG_TRIPLETS=$(filter-out %-darwin %-freebsd,$(TRIPLETS)) # These are the projects currently using BinaryBuilder; both GCC-expanded and non-GCC-expanded: -BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline libtracyclient +BB_PROJECTS=mbedtls libssh2 nghttp2 mpfr curl libgit2 pcre libuv unwind llvmunwind dsfmt objconv p7zip zlib libsuitesparse openlibm blastrampoline libtracyclient libmmtk_julia BB_GCC_EXPANDED_PROJECTS=openblas csl BB_CXX_EXPANDED_PROJECTS=gmp llvm clang llvm-tools lld # These are non-BB source-only deps diff --git a/src/Makefile b/src/Makefile index 52e673aa6cc1a..c01848c16adf7 100644 --- a/src/Makefile +++ b/src/Makefile @@ -29,6 +29,10 @@ ifeq ($(USECLANG),1) FLAGS += -Wno-return-type-c-linkage -Wno-atomic-alignment endif +ifeq ($(WITH_MMTK), 1) +FLAGS += -I$(MMTK_API_INC) -I$(MMTK_JULIA_INC) +endif + FLAGS += -DJL_BUILD_ARCH='"$(ARCH)"' ifeq ($(OS),WINNT) FLAGS += -DJL_BUILD_UNAME='"NT"' @@ -44,8 +48,8 @@ SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array genericmemory staticdata toplevel jl_uv datatype \ simplevector runtime_intrinsics precompile jloptions mtarraylist \ - threading scheduler stackwalk gc-common gc-stock gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler method \ - jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ + threading scheduler stackwalk gc-common gc-stock gc-mmtk gc-debug gc-pages gc-stacks gc-alloc-profiler gc-page-profiler \ + method jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall engine RT_LLVMLINK := @@ -103,7 +107,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-mmtk.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix $(SRCDIR)/,win32_ucontext.h) endif @@ -168,8 +172,8 @@ LIBJULIA_PATH_REL := libjulia endif COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir) -RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) +RT_LIBS := $(WHOLE_ARCHIVE) $(LIBUV) $(WHOLE_ARCHIVE) 
$(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) +CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) $(MMTK_LIB) RT_DEBUG_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a -ljulia-debug $(RT_LIBS) CG_DEBUG_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia-debug -ljulia-internal-debug RT_RELEASE_LIBS := $(COMMON_LIBPATHS) $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a -ljulia $(RT_LIBS) @@ -178,6 +182,15 @@ CG_RELEASE_LIBS := $(COMMON_LIBPATHS) $(CG_LIBS) -ljulia -ljulia-internal OBJS := $(SRCS:%=$(BUILDDIR)/%.o) DOBJS := $(SRCS:%=$(BUILDDIR)/%.dbg.obj) +ifeq ($(WITH_MMTK), 1) +MMTK_SRCS := mmtk_julia +MMTK_OBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.o) $(MMTK_LIB_DST) +MMTK_DOBJS := $(MMTK_SRCS:%=$(MMTK_JULIA_INC)/%.dbg.obj) $(MMTK_LIB_DST) +else +MMTK_OBJS := +MMTK_DOBJS := +endif + CODEGEN_OBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.o) CODEGEN_DOBJS := $(CODEGEN_SRCS:%=$(BUILDDIR)/%.dbg.obj) @@ -226,6 +239,16 @@ $(BUILDDIR)/%.h.gen : $(SRCDIR)/%.d sed 's/JULIA_/JL_PROBE_/' $@ > $@.tmp mv $@.tmp $@ +# Compile files from the binding side and copy so file into lib folder +ifeq ($(WITH_MMTK), 1) +$(MMTK_JULIA_INC)/%.o: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) + @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@) +$(MMTK_JULIA_INC)/%.dbg.obj: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) + @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@) +$(MMTK_LIB_DST): $(MMTK_LIB_SRC) + @$(call PRINT_MMTK, cp $< $@) +endif + $(BUILDDIR)/jl_internal_funcs.inc: $(SRCDIR)/jl_exported_funcs.inc # Generate `.inc` file that contains a list of `#define` macros to rename functions defined in `libjulia-internal` # to have a `ijl_` prefix instead of `jl_`, to denote that they are coming from `libjulia-internal`. 
This avoids @@ -318,6 +341,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h +$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/mmtk-gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h @@ -389,14 +413,14 @@ $(BUILDDIR)/julia.expmap: $(SRCDIR)/julia.expmap.in $(JULIAHOME)/VERSION $(LLVM_ sed <'$<' >'$@' -e "s/@JULIA_SHLIB_SYMBOL_VERSION@/JL_LIBJULIA_$(SOMAJOR)/" \ -e "s/@LLVM_SHLIB_SYMBOL_VERSION@/$(LLVM_SHLIB_SYMBOL_VERSION)/" -$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) - @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(RPATH_LIB) -o $@ \ +$(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(OBJS) $(MMTK_OBJS) $(BUILDDIR)/flisp/libflisp.a $(BUILDDIR)/support/libsupport.a $(LIBUV) + @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(MMTK_OBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ -$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) - @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(RPATH_LIB) -o $@ \ +$(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(MMTK_DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) + @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(MMTK_DOBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@ $(DSYMUTIL) $@ @@ -455,6 +479,7 @@ clean: -rm -f $(BUILDDIR)/*.dbg.obj $(BUILDDIR)/*.o $(BUILDDIR)/*.dwo $(BUILDDIR)/*.$(SHLIB_EXT) $(BUILDDIR)/*.a $(BUILDDIR)/*.h.gen -rm -f $(BUILDDIR)/julia.expmap -rm -f $(BUILDDIR)/julia_version.h + -rm -f $(MMTK_OBJS) $(MMTK_DOBJS) clean-flisp: -$(MAKE) -C $(SRCDIR)/flisp clean BUILDDIR='$(abspath $(BUILDDIR)/flisp)' diff --git a/src/builtins.c b/src/builtins.c index 96c4cec0f5087..4a778035de405 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -22,6 +22,7 @@ #include #include "julia.h" #include "julia_internal.h" +#include "gc-interface.h" #include "builtin_proto.h" #include "intrinsics.h" #include "julia_assert.h" diff --git a/src/gc-common.c 
b/src/gc-common.c index 417f12f26d64d..17f6f1330743b 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -705,6 +705,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. + mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-debug.c b/src/gc-debug.c index 7c479484cde45..ecd7f2328cada 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #include "julia.h" @@ -1129,3 +1129,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index fcda11dad4f8a..d3cb1e98d84a4 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -1,5 +1,4 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - #include "gc-heap-snapshot.h" #include "julia.h" diff --git a/src/gc-interface.h b/src/gc-interface.h index 0e9ce32697f35..72a57f4944156 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -192,6 +192,9 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. 
struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// FIXME: add description here +void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c new file mode 100644 index 0000000000000..e459b0f12c41d --- /dev/null +++ b/src/gc-mmtk.c @@ -0,0 +1,843 @@ +#ifdef MMTK_GC + +#include "mmtk_julia.h" +#include "gc-common.h" +#include "mmtkMutator.h" +#include "gc-mmtk.h" +#include "threading.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// For now we're using the same values as stock-gc. However +// for the heap size we use 70% of the free memory available +// since that is actually a hard limit in MMTk. + +// max_total_memory is a suggestion. We try very hard to stay +// under this limit, but we will go above it rather than halting. +#ifdef _P64 +typedef uint64_t memsize_t; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; +#else +typedef uint32_t memsize_t; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +// Work really hard to stay within 2GB +// Alternative is to risk running out of address space +// on 32 bit architectures. +#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; +#endif + +void jl_gc_init(void) { + // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) + + JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); + + arraylist_new(&to_finalize, 0); + arraylist_new(&finalizer_list_marked, 0); + + gc_num.allocd = 0; + gc_num.max_pause = 0; + gc_num.max_memory = 0; + + long long min_heap_size; + long long max_heap_size; + char* min_size_def = getenv("MMTK_MIN_HSIZE"); + char* min_size_gb = getenv("MMTK_MIN_HSIZE_G"); + + char* max_size_def = getenv("MMTK_MAX_HSIZE"); + char* max_size_gb = getenv("MMTK_MAX_HSIZE_G"); + + // default min heap currently set as Julia's default_collect_interval + if (min_size_def != NULL) { + char *p; + double min_size = strtod(min_size_def, &p); + min_heap_size = (long) 1024 * 1024 * min_size; + } else if (min_size_gb != NULL) { + char *p; + double min_size = strtod(min_size_gb, &p); + min_heap_size = (long) 1024 * 1024 * 1024 * min_size; + } else { + min_heap_size = default_collect_interval; + } + + // default max heap currently set as 70% the free memory in the system + if (max_size_def != NULL) { + char *p; + double max_size = strtod(max_size_def, &p); + max_heap_size = (long) 1024 * 1024 * max_size; + } else if (max_size_gb != NULL) { + char *p; + double max_size = strtod(max_size_gb, &p); + max_heap_size = (long) 1024 * 1024 * 1024 * max_size; + } else { + max_heap_size = uv_get_free_memory() * 70 / 100; + } + + // Assert that the number of stock GC threads is 0; MMTK uses the number of threads in jl_options.ngcthreads + assert(jl_n_gcthreads == 0); + + // Check that the julia_copy_stack rust feature has been defined when the COPY_STACK has been defined + int copy_stacks; + +#ifdef COPY_STACKS + copy_stacks = 1; +#else + copy_stacks = 0; +#endif + + mmtk_julia_copy_stack_check(copy_stacks); + + // if only max size is specified initialize MMTk with a fixed size heap + // TODO: We just 
assume mark threads means GC threads, and ignore the number of concurrent sweep threads. + // If the two values are the same, we can use either. Otherwise, we need to be careful. + uintptr_t gcthreads = jl_options.nmarkthreads; + if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { + mmtk_gc_init(0, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } else { + mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); + } +} + +void jl_start_gc_threads(void) { + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_initialize_collection((void *)ptls); + // int nthreads = jl_atomic_load_relaxed(&jl_n_threads); + // int ngcthreads = jl_n_gcthreads; + // int nmutator_threads = nthreads - ngcthreads; + // printf("nthreads = %d, ngcthreads = %d, nmutator_threads = %d\n", nthreads, ngcthreads, nmutator_threads); +} + +void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { + jl_thread_heap_t *heap = &ptls->gc_tls.heap; + small_arraylist_new(&heap->weak_refs, 0); + small_arraylist_new(&heap->live_tasks, 0); + for (int i = 0; i < JL_N_STACK_POOLS; i++) + small_arraylist_new(&heap->free_stacks[i], 0); + heap->mallocarrays = NULL; + heap->mafreelist = NULL; + arraylist_new(&ptls->finalizers, 0); + // Clear the malloc sz count + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + // Create mutator + MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); + // Copy the mutator to the thread local storage + memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); + // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) + mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); + memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); +} + +void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { + mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); +} + +// FIXME: mmtk uses the same code as stock to enable/disable the GC +// Should this be moved to gc-common.c? 
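+// The disable state is tracked in two places: per thread (ptls->disable_gc)
+// and in the global counter below. jl_gc_collect returns without collecting
+// while jl_gc_disable_counter is non-zero, so disabling the GC on any one
+// thread inhibits collection process-wide; the counter starts at 1 so that
+// no collection can run during startup.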
+ +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) { + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) { + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { + // MMTk currently does not allow setting the heap size at runtime +} + +JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); + jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); + return; + } + mmtk_handle_user_collection_request(ptls, collection); +} + +// same as above, some of these are identical to the implementation in gc stock +static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls) { + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + if (update_heap) { + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + } + } + } +} + + +void reset_thread_gc_counts(void) JL_NOTSAFEPOINT +{ + int gc_n_threads; + jl_ptls_t* gc_all_tls_states; + gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); + gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls != NULL) { + // don't reset `pool_live_bytes` here + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + } + } +} + +// weak references +// --- +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + 
wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + + +// allocation +int jl_gc_classify_pools(size_t sz, int *osize) +{ + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic +} + +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + +// Retrieves Julia's `GC_Num` (structure that stores GC statistics). +JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + return num; +} + +JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_sync_total_bytes(int64_t offset) JL_NOTSAFEPOINT +{ + int64_t oldtb = last_gc_total_bytes; + int64_t newtb; + jl_gc_get_total_bytes(&newtb); + last_gc_total_bytes = newtb - offset; + return newtb - oldtb; +} + +JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { + return 0; +} + +void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_current_task->ptls; + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); +} + +void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT +{ +} + +int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT +{ + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, inc); + return live_bytes += inc; +} + +void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT +{ + combine_thread_gc_counts(&gc_num, 0); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); + gc_num.allocd = 0; + gc_num.deferred_alloc = 0; + reset_thread_gc_counts(); +} + +JL_DLLEXPORT int64_t jl_gc_live_bytes(void) { + return last_live_bytes; +} + +JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT +{ + jl_gc_num_t num = gc_num; + combine_thread_gc_counts(&num, 0); + // Sync this logic with `base/util.jl:GC_Diff` + *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); +} + +JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) +{ + // FIXME: should probably return MMTk's heap size + return max_total_memory; +} + +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); +extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); +extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); + + +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; +extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; + +// These need to be constants. + +#define MMTK_OBJECT_BARRIER (1) +// Stickyimmix needs write barrier. Immix does not need write barrier. 
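+// (StickyImmix collects the young generation in place, so it must log old
+// objects that acquire references into the young generation; that is what
+// the object barrier below does. Plain Immix traces the whole heap on every
+// cycle and needs no remembered set.)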
+#ifdef MMTK_PLAN_IMMIX +#define MMTK_NEEDS_WRITE_BARRIER (0) +#endif +#ifdef MMTK_PLAN_STICKYIMMIX +#define MMTK_NEEDS_WRITE_BARRIER (1) +#endif + +#ifdef MMTK_CONSERVATIVE_SCAN +#define MMTK_NEEDS_VO_BIT (1) +#else +#define MMTK_NEEDS_VO_BIT (0) +#endif + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + +// Directly call into MMTk for write barrier (debugging only) +inline void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_post(&ptls->gc_tls.mmtk_mutator, parent, ptr); +} + +// Fastpath. Return 1 if we should go to slowpath +inline int mmtk_gc_wb_fast_check(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) (void*) parent; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + uint8_t byte_val = *meta_addr; + return ((byte_val >> shift) & 1) == 1; + } else { + return 0; + } +} + +// Slowpath. +inline void mmtk_gc_wb_slow(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr); + } +} + +inline void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (mmtk_gc_wb_fast_check(parent, ptr)) { + mmtk_gc_wb_slow(parent, ptr); + } +} + +inline void mmtk_gc_wb_binding(const void *bnd, const void *val) JL_NOTSAFEPOINT +{ + if (mmtk_gc_wb_fast_check(bnd, val)) { + jl_astaggedvalue(bnd)->bits.gc = 2; // to indicate that the buffer is a binding + mmtk_gc_wb_slow(bnd, val); + } +} + +#define MMTK_MIN_ALIGNMENT 4 +// MMTk assumes allocation size is aligned to min alignment. 
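+// e.g. with MMTK_MIN_ALIGNMENT == 4: mmtk_align_alloc_sz(13) == 16,
+// mmtk_align_alloc_sz(16) == 16, and mmtk_align_alloc_sz(17) == 20.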
+inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT +{ + return (sz + MMTK_MIN_ALIGNMENT - 1) & ~(MMTK_MIN_ALIGNMENT - 1); +} + +inline void* bump_alloc_fast(MMTkMutatorContext* mutator, uintptr_t* cursor, uintptr_t limit, size_t size, size_t align, size_t offset, int allocator) { + intptr_t delta = (-offset - *cursor) & (align - 1); + uintptr_t result = *cursor + (uintptr_t)delta; + + if (__unlikely(result + size > limit)) { + return (void*) mmtk_alloc(mutator, size, align, offset, allocator); + } else{ + *cursor = result + size; + return (void*)result; + } +} + +inline void* mmtk_immix_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + ImmixAllocator* allocator = &mutator->allocators.immix[MMTK_DEFAULT_IMMIX_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (intptr_t)allocator->limit, size, align, offset, 0); +} + +inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, size_t size) { + mmtk_post_alloc(mutator, obj, size, 0); +} + +inline void mmtk_set_vo_bit(void* obj) { + intptr_t addr = (intptr_t) obj; + intptr_t shift = (addr >> 3) & 0b111; + uint8_t* vo_meta_addr = (uint8_t*) (MMTK_SIDE_VO_BIT_BASE_ADDRESS) + (addr >> 6); + uint8_t new_val = (*vo_meta_addr) | (1 << shift); + (*vo_meta_addr) = new_val; +} + +inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + if (MMTK_NEEDS_VO_BIT) { + // set VO bit + mmtk_set_vo_bit(obj); + } +} + +inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { + BumpAllocator* allocator = &mutator->allocators.bump_pointer[MMTK_IMMORTAL_BUMP_ALLOCATOR]; + return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1); +} + +inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { + if (MMTK_NEEDS_VO_BIT) { + // set VO bit + mmtk_set_vo_bit(obj); + } + + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) obj; + intptr_t shift = (addr >> 3) & 0b111; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + while(1) { + uint8_t old_val = *meta_addr; + uint8_t new_val = old_val | (1 << shift); + if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { + break; + } + } + } +} + +// mutex for page profile +uv_mutex_t page_profile_lock; + +JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) +{ + uv_mutex_lock(&page_profile_lock); + const char *str = "Page profiler in unsupported in MMTk."; + ios_write(stream, str, strlen(str)); + uv_mutex_unlock(&page_profile_lock); +} + +// this seems to be needed by the gc tests +#define JL_GC_N_MAX_POOLS 51 +JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; + +STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT +{ + // FIXME: MMTk would have to provide its own stats +} + +#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants + +JL_DLLEXPORT uint64_t jl_get_pg_size(void) +{ + return MMTK_GC_PAGE_SZ; +} + + +extern void mmtk_store_obj_size_c(void* obj, size_t size); + +inline void maybe_collect(jl_ptls_t ptls) +{ + // Just do a safe point for general maybe_collect + jl_gc_safepoint_(ptls); +} + +// This is only used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), +// is expensive. So we only check for every few allocations. 
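+// With the 4096-byte threshold below, a thread polls MMTk roughly once per
+// 4KB of malloc'd memory (about every 64 calls when allocating 64-byte
+// chunks) and only takes the cheap safepoint check in between.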
+static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) +{ + // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to + // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage + // as much as we can. + if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) { + jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0); + mmtk_gc_poll(ptls); + } else { + jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz); + jl_gc_safepoint_(ptls); + } +} + +// allocation wrappers that track allocation and let collection run + +JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = malloc(sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); + } + return data; +} + +JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + void *data = calloc(nm, sz); + if (data != NULL && pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, nm * sz); + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); + } + return data; +} + +JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + free(p); + if (pgcstack != NULL && ct->world_age) { + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); + } +} + +JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) +{ + jl_gcframe_t **pgcstack = jl_get_pgcstack(); + jl_task_t *ct = jl_current_task; + if (pgcstack && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + malloc_maybe_collect(ptls, sz); + if (sz < old) + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, old - sz); + else + jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); + } + return realloc(p, sz); +} + +void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) +{ + jl_ptls_t ptls = jl_current_task->ptls; + size_t allocsz = mmtk_align_alloc_sz(sz); + void* addr = mmtk_immortal_alloc_fast(&ptls->gc_tls.mmtk_mutator, allocsz, align, offset); + return addr; +} + +void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset) +{ + return jl_gc_perm_alloc_nolock(sz, zero, align, offset); +} + +jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT +{ + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + unsigned align = (sz == 0 ? sizeof(void*) : (allocsz <= sizeof(void*) * 2 ? 
+ sizeof(void*) * 2 : 16)); + jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align, + sizeof(void*) % align); + + jl_ptls_t ptls = jl_current_task->ptls; + mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz); + o->header = (uintptr_t)ty; + return jl_valueof(o); +} + + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty) +{ + // safepoint + jl_gc_safepoint_(ptls); + + jl_value_t *v; + if ((uintptr_t)ty != jl_buff_tag) { + // v needs to be 16 byte aligned, therefore v_tagged needs to be offset accordingly to consider the size of header + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize, align), align, sizeof(jl_taggedvalue_t)); + v = jl_valueof(v_tagged); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize, align)); + } else { + // allocating an extra word to store the size of buffer objects + jl_taggedvalue_t *v_tagged = (jl_taggedvalue_t *)mmtk_immix_alloc_fast(&ptls->gc_tls.mmtk_mutator, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align), align, 0); + jl_value_t* v_tagged_aligned = ((jl_value_t*)((char*)(v_tagged) + sizeof(jl_taggedvalue_t))); + v = jl_valueof(v_tagged_aligned); + mmtk_store_obj_size_c(v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); + } + + ptls->gc_tls.gc_num.allocd += osize; + ptls->gc_tls.gc_num.poolalloc++; + + return v; +} + +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + mmtk_set_vm_space((void*)img_data, len); +} + +JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) +{ + // safepoint + jl_gc_safepoint_(ptls); + + size_t offs = offsetof(bigval_t, header); + assert(sz >= sizeof(jl_taggedvalue_t) && "sz must include tag"); + static_assert(offsetof(bigval_t, header) >= sizeof(void*), "Empty bigval header?"); + static_assert(sizeof(bigval_t) % JL_HEAP_ALIGNMENT == 0, ""); + size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) { // overflow in adding offs, size was "negative" + assert(0 && "Error when allocating big object"); + jl_throw(jl_memory_exception); + } + + bigval_t *v = (bigval_t*)mmtk_alloc_large(&ptls->gc_tls.mmtk_mutator, allocsz, JL_CACHE_BYTE_ALIGNMENT, 0, 2); + + if (v == NULL) { + assert(0 && "Allocation failed"); + jl_throw(jl_memory_exception); + } + v->sz = allocsz; + + ptls->gc_tls.gc_num.allocd += allocsz; + ptls->gc_tls.gc_num.bigalloc++; + + jl_value_t *result = jl_valueof(&v->header); + mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); + + return result; +} + +// Instrumented version of jl_gc_small_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_small_alloc(jl_ptls_t ptls, int offset, int osize, jl_value_t* type) +{ + assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); + maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); + return val; +} + +// Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. +JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) +{ + // TODO: assertion needed here? 
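+    // (jl_gc_small_alloc above makes the same assertion: generated code is
+    // expected to call these entry points with gc_state == 0, i.e. outside
+    // any GC-safe region.)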
+ assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); + + jl_value_t *val = jl_mmtk_gc_alloc_big(ptls, sz); + maybe_record_alloc_to_profile(val, sz, (jl_datatype_t*)type); + return val; +} + +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + v = jl_mmtk_gc_alloc_default(ptls, allocsz, 16, ty); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_mmtk_gc_alloc_big(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + +JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + maybe_collect(ptls); + size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + + int last_errno = errno; +#ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +#endif + void *b = malloc_cache_align(allocsz); + if (b == NULL) + jl_throw(jl_memory_exception); + + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + // FIXME: Should these be part of mmtk's heap? + // malloc_maybe_collect(ptls, sz); + // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); +#ifdef _OS_WINDOWS_ + SetLastError(last_error); +#endif + errno = last_errno; + // jl_gc_managed_malloc is currently always used for allocating array buffers. + maybe_record_alloc_to_profile((jl_value_t*)b, sz, (jl_datatype_t*)jl_buff_tag); + return b; +} + +// Not used by mmtk +// Number of GC threads that may run parallel marking +int jl_n_markthreads; +// Number of GC threads that may run concurrent sweeping (0 or 1) +int jl_n_sweepthreads; +// `tid` of first GC thread +int gc_first_tid; + +JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored, + struct _jl_datatype_t *dt) JL_NOTSAFEPOINT +{ + mmtk_unreachable(); +} + +// marking +// --- + +JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) +{ + mmtk_unreachable(); + return 0; +} +JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, + jl_value_t **objs, size_t nobjs) +{ + mmtk_unreachable(); +} + +JL_DLLEXPORT size_t jl_gc_max_internal_obj_size(void) +{ + // TODO: meaningful for MMTk? + return GC_MAX_SZCLASS; +} + +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +{ + // FIXME: do we need to implement this? 
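+    // (the stock collector pushes obj onto ptls->gc_tls.sweep_objs here so
+    // that the foreign sweep function runs during sweeping; MMTk does not
+    // expose an equivalent hook yet)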
+} + +// gc-debug functions +// --- + +JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p) +{ + return NULL; +} + +void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT +{ +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return 0; +} + +void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT +{ + // May not be accurate but should be helpful enough + uint64_t pool_count = gc_num.poolalloc; + uint64_t big_count = gc_num.bigalloc; + jl_safe_printf("Allocations: %" PRIu64 " " + "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n", + pool_count + big_count, pool_count, big_count, gc_num.pause); +} + +JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) +{ + return sizeof(bigval_t); +} + +void jl_print_gc_stats(JL_STREAM *s) +{ +} + +JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) +{ + return 0; +} + +JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void) +{ + return 0; +} + +// TODO: if this is needed, it can be added in MMTk +JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p) +{ + return NULL; +} + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/gc-mmtk.h b/src/gc-mmtk.h new file mode 100644 index 0000000000000..6c2c7a40bc81f --- /dev/null +++ b/src/gc-mmtk.h @@ -0,0 +1,34 @@ +#ifdef MMTK_GC + +#ifdef __cplusplus +extern "C" { +#endif + +extern jl_mutex_t finalizers_lock; +extern arraylist_t to_finalize; +extern arraylist_t finalizer_list_marked; + +JL_EXTENSION typedef struct _bigval_t { + size_t sz; +#ifdef _P64 // Add padding so that the value is 64-byte aligned + // (8 pointers of 8 bytes each) - (2 other pointers in struct) + void *_padding[8 - 2]; +#else + // (16 pointers of 4 bytes each) - (2 other pointers in struct) + void *_padding[16 - 2]; +#endif + //struct jl_taggedvalue_t <>; + union { + uintptr_t header; + struct { + uintptr_t gc:2; + } bits; + }; + // must be 64-byte aligned here, in 32 & 64 bit modes +} bigval_t; + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index 2625fa812781a..bfd1c74247df8 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-page-profiler.h" #include "julia.h" @@ -178,3 +178,5 @@ JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-pages.c b/src/gc-pages.c index 71d59de29166f..ed6e0ed20ba1c 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -1,5 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #ifndef _OS_WINDOWS_ @@ -205,3 +205,5 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-stock.c b/src/gc-stock.c index 3ff37566dc6c7..164d3067a31de 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1,5 +1,5 @@ // This file is a part of Julia. 
License is MIT: https://julialang.org/license - +#ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" #include "gc-alloc-profiler.h" @@ -405,7 +405,6 @@ static void sweep_weak_refs(void) } } - STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; @@ -453,7 +452,6 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } - // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) { @@ -3888,12 +3886,22 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } +<<<<<<< HEAD +======= +>>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); } +void jl_gc_notify_image_load(const char* img_data, size_t len) +{ + // Do nothing +} + #ifdef __cplusplus } #endif + +#endif // !MMTK_GC diff --git a/src/gc-stock.h b/src/gc-stock.h index 50eca3aadbd86..8e563f32ab9d3 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -5,6 +5,7 @@ . non-moving, precise mark and sweep collector . pool-allocates small objects, keeps big objects on a simple list */ +#ifndef MMTK_GC #ifndef JL_GC_H #define JL_GC_H @@ -422,21 +423,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); @@ -712,3 +698,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #endif #endif + +#endif // !MMTK_GC diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h new file mode 100644 index 0000000000000..2eb5f2a6a44d9 --- /dev/null +++ b/src/gc-tls-mmtk.h @@ -0,0 +1,49 @@ +// This file is a part of Julia. 
License is MIT: https://julialang.org/license + +#ifdef MMTK_GC + +#include +#include "mmtkMutator.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_t; + +typedef struct { + jl_thread_heap_t heap; + jl_thread_gc_num_t gc_num; + MMTkMutatorContext mmtk_mutator; + size_t malloc_sz_since_last_poll; +} jl_gc_tls_states_t; + +#ifdef __cplusplus +} +#endif + +#endif // MMTK_GC diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..43adfb8a7ff2a 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -1,5 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license +#ifndef MMTK_GC + // Meant to be included in "julia_threads.h" #ifndef JL_GC_TLS_H #define JL_GC_TLS_H @@ -90,3 +92,5 @@ typedef struct { #endif #endif // JL_GC_TLS_H + +#endif // MMTK_GC diff --git a/src/julia.h b/src/julia.h index abb8a57ff13b0..db57db1fbeb38 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index e677f40907dfd..d5013601a9124 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1052,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
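Note (illustrative sketch, not part of any commit in this series): the jl_gc_tls_states_t defined above embeds an MMTkMutatorContext directly in each thread's TLS, which is what both the runtime slow path and the later LLVM fastpath lowering reach through ptls->gc_tls.mmtk_mutator. Below is a minimal C sketch of a small default-pool allocation through that mutator, assuming the mmtk_alloc/mmtk_post_alloc FFI entry points declared in gc-mmtk.c and the 16-byte pool alignment used by jl_gc_classify_pools; the helper name example_alloc_small is hypothetical.

// Illustrative only: mirrors what jl_mmtk_gc_alloc_default does with the
// per-thread MMTkMutatorContext; allocator index 0 is the default immix allocator.
static jl_value_t *example_alloc_small(jl_ptls_t ptls, size_t sz, void *ty)
{
    // round up to the pool size class, including the object header
    size_t allocsz = LLT_ALIGN(sz + sizeof(jl_taggedvalue_t), 16);
    jl_taggedvalue_t *o = (jl_taggedvalue_t*)
        mmtk_alloc(&ptls->gc_tls.mmtk_mutator, allocsz, 16, 0, 0);
    mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz, 0);
    jl_value_t *v = jl_valueof(o);
    jl_set_typeof(v, ty); // the GC interface requires the type tag to be set
    return v;
}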
diff --git a/src/julia_threads.h b/src/julia_threads.h
index b697a0bf030ed..641c50386c555 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -4,7 +4,11 @@
 #ifndef JL_THREADS_H
 #define JL_THREADS_H
 
+#ifndef MMTK_GC
 #include "gc-tls.h"
+#else
+#include "gc-tls-mmtk.h"
+#endif
 #include "julia_atomics.h"
 #ifndef _OS_WINDOWS_
 #include "pthread.h"
diff --git a/src/stackwalk.c b/src/stackwalk.c
index a1de3a6d61a07..e6fc2c7bbf56a 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT
 }
 
 extern int gc_first_tid;
+extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT;
+extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT;
 
 // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr
 JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT
diff --git a/src/staticdata.c b/src/staticdata.c
index 363aa46b62221..e07a5365bf06f 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -654,6 +654,7 @@ static void jl_load_sysimg_so(void)
         plen = (size_t *)&jl_system_image_size;
     else
         jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1);
+    jl_gc_notify_image_load(sysimg_data, *plen);
     jl_restore_system_image_data(sysimg_data, *plen);
 }
 
@@ -3899,6 +3900,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j
     jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1);
     size_t *plen;
     jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1);
+    jl_gc_notify_image_load(pkgimg_data, *plen);
 
     jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle);
 
diff --git a/src/threading.c b/src/threading.c
index 44b1192528531..df62ea107bf04 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -743,6 +743,10 @@ void jl_init_threading(void)
     }
 
     int16_t ngcthreads = jl_n_markthreads + jl_n_sweepthreads;
+#ifdef MMTK_GC
+    ngcthreads = 0;
+#endif
+
     jl_all_tls_states_size = nthreads + nthreadsi + ngcthreads;
     jl_n_threads_per_pool = (int*)malloc_s(2 * sizeof(int));
     jl_n_threads_per_pool[0] = nthreadsi;

From b488bbeb22847c3740459d015878368587ecb847 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 28 Aug 2024 00:44:09 +0000
Subject: [PATCH 079/116] Refactoring to be considered before adding MMTk

---
 src/gc-interface.h | 4 +++-
 src/gc-stock.c     | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gc-interface.h b/src/gc-interface.h
index 72a57f4944156..b1f3ab9d6908d 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -192,7 +192,9 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align,
 // object being allocated and will be used to set the object header.
 struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT;
 
-// FIXME: add description here
+// This function notifies the GC about memory addresses that are set when loading the boot image.
+// The GC may use that information to, for instance, determine that such objects should
+// be treated as marked and as belonging to the old generation in nursery collections.
void jl_gc_notify_image_load(const char* img_data, size_t len); // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 164d3067a31de..019ae481ce189 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3888,8 +3888,11 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) +======= +>>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From a4cf8e7c754fc72c9612750ccce65b87eaeb720b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 05:37:53 +0000 Subject: [PATCH 080/116] Adding fastpath allocation --- src/llvm-gc-interface-passes.h | 5 ++ src/llvm-late-gc-lowering.cpp | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index d33567e887118..ed6b94dcdc3fc 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -328,6 +329,7 @@ struct LateLowerGCFrame: private JuliaPassContext { private: CallInst *pgcstack; + Function *smallAllocFunc; void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector()); @@ -365,6 +367,9 @@ struct LateLowerGCFrame: private JuliaPassContext { void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); +#ifdef MMTK_GC + Value* lowerGCAllocBytesLate(CallInst *target, Function &F); +#endif }; // The final GC lowering pass. This pass lowers platform-agnostic GC diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 1d390a5115207..d395771f6df0c 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2452,8 +2452,122 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl &Colors, St } } +#ifdef MMTK_GC +Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) +{ + assert(target->arg_size() == 3); + + IRBuilder<> builder(target); + auto ptls = target->getArgOperand(0); + auto type = target->getArgOperand(2); + if (auto CI = dyn_cast(target->getArgOperand(1))) { + size_t sz = (size_t)CI->getZExtValue(); + // This is strongly architecture and OS dependent + int osize; + int offset = jl_gc_classify_pools(sz, &osize); + if (offset >= 0) { + // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc + // We do a slowpath/fastpath check and lower it only on the slowpath, returning + // the cursor and updating it in the fastpath. + auto pool_osize_i32 = ConstantInt::get(Type::getInt32Ty(F.getContext()), osize); + auto pool_osize = ConstantInt::get(Type::getInt64Ty(F.getContext()), osize); + + // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk. + // Setting this to false will increase allocation overhead a lot, and should only be used for debugging. + const bool INLINE_FASTPATH_ALLOCATION = true; + + if (INLINE_FASTPATH_ALLOCATION) { + // Assuming we use the first immix allocator. + // FIXME: We should get the allocator index and type from MMTk. 
+ auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix); + + auto cursor_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, cursor)); + auto limit_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), allocator_offset + offsetof(ImmixAllocator, limit)); + + auto cursor_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, cursor_pos); + auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "cursor_ptr"); + auto cursor = builder.CreateLoad(Type::getInt64Ty(target->getContext()), cursor_ptr, "cursor"); + + // offset = 8 + auto delta_offset = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), ConstantInt::get(Type::getInt64Ty(target->getContext()), 8)); + auto delta_cursor = builder.CreateNSWSub(ConstantInt::get(Type::getInt64Ty(target->getContext()), 0), cursor); + auto delta_op = builder.CreateNSWAdd(delta_offset, delta_cursor); + // alignment 16 (15 = 16 - 1) + auto delta = builder.CreateAnd(delta_op, ConstantInt::get(Type::getInt64Ty(target->getContext()), 15), "delta"); + auto result = builder.CreateNSWAdd(cursor, delta, "result"); + + auto new_cursor = builder.CreateNSWAdd(result, pool_osize); + + auto limit_tls_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, limit_pos); + auto limit_ptr = builder.CreateBitCast(limit_tls_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "limit_ptr"); + auto limit = builder.CreateLoad(Type::getInt64Ty(target->getContext()), limit_ptr, "limit"); + + auto gt_limit = builder.CreateICmpSGT(new_cursor, limit); + + auto slowpath = BasicBlock::Create(target->getContext(), "slowpath", target->getFunction()); + auto fastpath = BasicBlock::Create(target->getContext(), "fastpath", target->getFunction()); + + auto next_instr = target->getNextNode(); + SmallVector Weights{1, 9}; + + MDBuilder MDB(F.getContext()); + SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, MDB.createBranchWeights(Weights)); + + builder.SetInsertPoint(next_instr); + auto phiNode = builder.CreatePHI(target->getCalledFunction()->getReturnType(), 2, "phi_fast_slow"); + + // slowpath + builder.SetInsertPoint(slowpath); + auto pool_offs = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1); + auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, pool_osize_i32, type }); + new_call->setAttributes(new_call->getCalledFunction()->getAttributes()); + builder.CreateBr(next_instr->getParent()); + + // fastpath + builder.SetInsertPoint(fastpath); + builder.CreateStore(new_cursor, cursor_ptr); + + // ptls->gc_tls.gc_num.allocd += osize; + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); + auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); + auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); + auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize); + builder.CreateStore(pool_allocd_total, pool_alloc_tls); + + auto v_raw = builder.CreateNSWAdd(result, 
ConstantInt::get(Type::getInt64Ty(target->getContext()), sizeof(jl_taggedvalue_t))); + auto v_as_ptr = builder.CreateIntToPtr(v_raw, smallAllocFunc->getReturnType()); + builder.CreateBr(next_instr->getParent()); + + phiNode->addIncoming(new_call, slowpath); + phiNode->addIncoming(v_as_ptr, fastpath); + phiNode->takeName(target); + return phiNode; + } + } + } + return target; +} + +template +static void replaceInstruction( + Instruction *oldInstruction, + Value *newInstruction, + TIterator &it) +{ + if (newInstruction != oldInstruction) { + oldInstruction->replaceAllUsesWith(newInstruction); + it = oldInstruction->eraseFromParent(); + } + else { + ++it; + } +} +#endif + bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { initAll(*F.getParent()); + smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc); LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n"); if (!pgcstack_getter && !adoptthread_func) return CleanupIR(F, nullptr, CFGModified); @@ -2468,6 +2582,31 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) { std::map> CallFrames; // = OptimizeCallFrames(S, Ordering); PlaceRootsAndUpdateCalls(Colors, S, CallFrames); CleanupIR(F, &S, CFGModified); + +#ifdef MMTK_GC + // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk + for (BasicBlock &BB : F) { + for (auto it = BB.begin(); it != BB.end();) { + auto *CI = dyn_cast(&*it); + if (!CI) { + ++it; + continue; + } + + Value *callee = CI->getCalledOperand(); + assert(callee); + + auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes); + if (GCAllocBytes == callee) { + *CFGModified = true; + replaceInstruction(CI, lowerGCAllocBytesLate(CI, F), it); + continue; + } + ++it; + } + } +#endif + return true; } From ecb675a597ab3dcd57fc053c995252618b6b0edd Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 05:51:26 +0000 Subject: [PATCH 081/116] Fixing removed newlines --- src/gc-debug.c | 1 + src/gc-heap-snapshot.cpp | 1 + src/gc-page-profiler.c | 1 + src/gc-pages.c | 1 + src/gc-stock.c | 7 +++++++ 5 files changed, 11 insertions(+) diff --git a/src/gc-debug.c b/src/gc-debug.c index ecd7f2328cada..2c8e1c6055414 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" diff --git a/src/gc-heap-snapshot.cpp b/src/gc-heap-snapshot.cpp index d3cb1e98d84a4..fcda11dad4f8a 100644 --- a/src/gc-heap-snapshot.cpp +++ b/src/gc-heap-snapshot.cpp @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #include "gc-heap-snapshot.h" #include "julia.h" diff --git a/src/gc-page-profiler.c b/src/gc-page-profiler.c index bfd1c74247df8..e5c6b91978731 100644 --- a/src/gc-page-profiler.c +++ b/src/gc-page-profiler.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-page-profiler.h" #include "julia.h" diff --git a/src/gc-pages.c b/src/gc-pages.c index ed6e0ed20ba1c..976fc461d5b95 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -1,4 +1,5 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" diff --git a/src/gc-stock.c b/src/gc-stock.c index 019ae481ce189..05f2f5930448c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1,4 +1,5 @@ // This file is a part of Julia. 
License is MIT: https://julialang.org/license + #ifndef MMTK_GC #include "gc-common.h" #include "gc-stock.h" @@ -405,6 +406,7 @@ static void sweep_weak_refs(void) } } + STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; @@ -452,6 +454,7 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) return jl_valueof(&v->header); } + // Instrumented version of jl_gc_big_alloc_inner, called into by LLVM-generated code. JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz, jl_value_t *type) { @@ -3886,6 +3889,7 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } +<<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD @@ -3893,6 +3897,9 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) ======= >>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) +======= + +>>>>>>> 30ac6f081d (Fixing removed newlines) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From 77db2039905d73c9d6a30bef583d7ad15aea9ca1 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 082/116] Refactoring to be considered before adding MMTk --- src/gc-stock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gc-stock.c b/src/gc-stock.c index 05f2f5930448c..5fd3b7efafead 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3892,6 +3892,7 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD ======= >>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) @@ -3900,6 +3901,8 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) ======= >>>>>>> 30ac6f081d (Fixing removed newlines) +======= +>>>>>>> 2efcdf8335 (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From c5d3a40880cc08014ec6347372ea35c3249f8709 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 06:07:02 +0000 Subject: [PATCH 083/116] Adding a few comments; Moving some functions to be closer together --- src/gc-common.c | 70 ----------- src/gc-mmtk.c | 311 ++++++++++++++-------------------------------- src/gc-tls-mmtk.h | 2 + 3 files changed, 94 insertions(+), 289 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 17f6f1330743b..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -705,76 +705,6 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - -// tracking Memorys with malloc'd storage -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. 
- mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} - -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - #ifdef __cplusplus } #endif diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index e459b0f12c41d..98a5612871be0 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -10,9 +10,10 @@ extern "C" { #endif -// For now we're using the same values as stock-gc. However -// for the heap size we use 70% of the free memory available -// since that is actually a hard limit in MMTk. +// FIXME: Should the values below be shared between both GC's? +// Note that MMTk uses a hard max heap limit, which is set by default +// as 70% of the free available memory. The min heap is set as the +// default_collect_interval variable below. // max_total_memory is a suggestion. We try very hard to stay // under this limit, but we will go above it rather than halting. @@ -33,7 +34,6 @@ static memsize_t max_total_memory = (memsize_t) MAX32HEAP; void jl_gc_init(void) { // TODO: use jl_options.heap_size_hint to set MMTk's fixed heap size? (see issue: https://github.com/mmtk/mmtk-julia/issues/167) - JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock"); arraylist_new(&to_finalize, 0); @@ -105,10 +105,6 @@ void jl_gc_init(void) { void jl_start_gc_threads(void) { jl_ptls_t ptls = jl_current_task->ptls; mmtk_initialize_collection((void *)ptls); - // int nthreads = jl_atomic_load_relaxed(&jl_n_threads); - // int ngcthreads = jl_n_gcthreads; - // int nmutator_threads = nthreads - ngcthreads; - // printf("nthreads = %d, ngcthreads = %d, nmutator_threads = %d\n", nthreads, ngcthreads, nmutator_threads); } void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { @@ -135,38 +131,31 @@ void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { mmtk_destroy_mutator(&ptls->gc_tls.mmtk_mutator); } -// FIXME: mmtk uses the same code as stock to enable/disable the GC -// Should this be moved to gc-common.c? 
-
-_Atomic(uint32_t) jl_gc_disable_counter = 1;
-
-JL_DLLEXPORT int jl_gc_enable(int on) {
-    jl_ptls_t ptls = jl_current_task->ptls;
-    int prev = !ptls->disable_gc;
-    ptls->disable_gc = (on == 0);
-    if (on && !prev) {
-        // disable -> enable
-        if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) {
-            gc_num.allocd += gc_num.deferred_alloc;
-            gc_num.deferred_alloc = 0;
-        }
-    }
-    else if (prev && !on) {
-        // enable -> disable
-        jl_atomic_fetch_add(&jl_gc_disable_counter, 1);
-        // check if the GC is running and wait for it to finish
-        jl_gc_safepoint_(ptls);
-    }
-    return prev;
+JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) {
+    // MMTk currently does not allow setting the heap size at runtime
 }
-JL_DLLEXPORT int jl_gc_is_enabled(void) {
-    jl_ptls_t ptls = jl_current_task->ptls;
-    return !ptls->disable_gc;
+
+inline void maybe_collect(jl_ptls_t ptls)
+{
+    // Just do a safe point for general maybe_collect
+    jl_gc_safepoint_(ptls);
 }
-JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) {
-    // MMTk currently does not allow setting the heap size at runtime
+// This is only used for malloc. We need to know if we need to do GC. However, repeatedly checking with MMTk (mmtk_gc_poll)
+// is expensive, so we only check every few allocations.
+static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz)
+{
+    // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to
+    // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage
+    // as much as we can.
+    if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) {
+        jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0);
+        mmtk_gc_poll(ptls);
+    } else {
+        jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz);
+        jl_gc_safepoint_(ptls);
+    }
 }
 
 JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) {
@@ -182,7 +171,12 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) {
     mmtk_handle_user_collection_request(ptls, collection);
 }
 
-// same as above, some of these are identical to the implementation in gc stock
+// FIXME: The functions combine_thread_gc_counts and reset_thread_gc_counts
+// are currently nearly identical for mmtk and for stock. However, the stats
+// are likely different (e.g., MMTk doesn't track the bytes allocated in the fastpath,
+// but only when the slowpath is called). We might need to adapt these later so that
+// the statistics are the same or as close as possible for each GC.
+
 static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTSAFEPOINT
 {
     int gc_n_threads;
@@ -228,31 +222,6 @@ void reset_thread_gc_counts(void) JL_NOTSAFEPOINT
     }
 }
 
-// weak references
-// ---
-JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value)
-{
-    jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type);
-    wr->value = value;  // NOTE: wb not needed here
-    mmtk_add_weak_candidate(wr);
-    return wr;
-}
-
-
-// allocation
-int jl_gc_classify_pools(size_t sz, int *osize)
-{
-    if (sz > GC_MAX_SZCLASS)
-        return -1; // call big alloc function
-    size_t allocsz = sz + sizeof(jl_taggedvalue_t);
-    *osize = LLT_ALIGN(allocsz, 16);
-    return 0; // use MMTk's fastpath logic
-}
-
-int64_t last_gc_total_bytes = 0;
-int64_t last_live_bytes = 0; // live_bytes at last collection
-int64_t live_bytes = 0;
-
 // Retrieves Julia's `GC_Num` (structure that stores GC statistics).
JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -260,6 +229,10 @@ JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { return num; } +int64_t last_gc_total_bytes = 0; +int64_t last_live_bytes = 0; // live_bytes at last collection +int64_t live_bytes = 0; + JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void) JL_NOTSAFEPOINT { int64_t oldtb = last_gc_total_bytes; int64_t newtb; @@ -325,82 +298,38 @@ JL_DLLEXPORT uint64_t jl_gc_get_max_memory(void) return max_total_memory; } +// weak references +// --- +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) +{ + jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); + wr->value = value; // NOTE: wb not needed here + mmtk_add_weak_candidate(wr); + return wr; +} + +// allocation + extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); extern void* mmtk_alloc(void* mutator, size_t size, size_t align, size_t offset, int allocator); extern void mmtk_post_alloc(void* mutator, void* refer, size_t bytes, int allocator); - - extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; extern const void* MMTK_SIDE_VO_BIT_BASE_ADDRESS; - -// These need to be constants. - -#define MMTK_OBJECT_BARRIER (1) -// Stickyimmix needs write barrier. Immix does not need write barrier. -#ifdef MMTK_PLAN_IMMIX -#define MMTK_NEEDS_WRITE_BARRIER (0) -#endif -#ifdef MMTK_PLAN_STICKYIMMIX -#define MMTK_NEEDS_WRITE_BARRIER (1) -#endif - -#ifdef MMTK_CONSERVATIVE_SCAN -#define MMTK_NEEDS_VO_BIT (1) -#else -#define MMTK_NEEDS_VO_BIT (0) -#endif +extern void mmtk_store_obj_size_c(void* obj, size_t size); #define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) #define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) -// Directly call into MMTk for write barrier (debugging only) -inline void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_post(&ptls->gc_tls.mmtk_mutator, parent, ptr); -} - -// Fastpath. Return 1 if we should go to slowpath -inline int mmtk_gc_wb_fast_check(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - intptr_t addr = (intptr_t) (void*) parent; - uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); - intptr_t shift = (addr >> 3) & 0b111; - uint8_t byte_val = *meta_addr; - return ((byte_val >> shift) & 1) == 1; - } else { - return 0; - } -} - -// Slowpath. 
-inline void mmtk_gc_wb_slow(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr); - } -} -inline void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - if (mmtk_gc_wb_fast_check(parent, ptr)) { - mmtk_gc_wb_slow(parent, ptr); - } -} - -inline void mmtk_gc_wb_binding(const void *bnd, const void *val) JL_NOTSAFEPOINT +int jl_gc_classify_pools(size_t sz, int *osize) { - if (mmtk_gc_wb_fast_check(bnd, val)) { - jl_astaggedvalue(bnd)->bits.gc = 2; // to indicate that the buffer is a binding - mmtk_gc_wb_slow(bnd, val); - } + if (sz > GC_MAX_SZCLASS) + return -1; // call big alloc function + size_t allocsz = sz + sizeof(jl_taggedvalue_t); + *osize = LLT_ALIGN(allocsz, 16); + return 0; // use MMTk's fastpath logic } - #define MMTK_MIN_ALIGNMENT 4 // MMTk assumes allocation size is aligned to min alignment. inline size_t mmtk_align_alloc_sz(size_t sz) JL_NOTSAFEPOINT @@ -429,19 +358,9 @@ inline void mmtk_immix_post_alloc_slow(MMTkMutatorContext* mutator, void* obj, s mmtk_post_alloc(mutator, obj, size, 0); } -inline void mmtk_set_vo_bit(void* obj) { - intptr_t addr = (intptr_t) obj; - intptr_t shift = (addr >> 3) & 0b111; - uint8_t* vo_meta_addr = (uint8_t*) (MMTK_SIDE_VO_BIT_BASE_ADDRESS) + (addr >> 6); - uint8_t new_val = (*vo_meta_addr) | (1 << shift); - (*vo_meta_addr) = new_val; -} - inline void mmtk_immix_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - if (MMTK_NEEDS_VO_BIT) { - // set VO bit - mmtk_set_vo_bit(obj); - } + // FIXME: for now, we do nothing + // but when supporting moving, this is where we set the valid object (VO) bit } inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, size_t align, size_t offset) { @@ -450,79 +369,12 @@ inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, } inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - if (MMTK_NEEDS_VO_BIT) { - // set VO bit - mmtk_set_vo_bit(obj); - } - - if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { - intptr_t addr = (intptr_t) obj; - intptr_t shift = (addr >> 3) & 0b111; - uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); - while(1) { - uint8_t old_val = *meta_addr; - uint8_t new_val = old_val | (1 << shift); - if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { - break; - } - } - } -} - -// mutex for page profile -uv_mutex_t page_profile_lock; - -JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) -{ - uv_mutex_lock(&page_profile_lock); - const char *str = "Page profiler in unsupported in MMTk."; - ios_write(stream, str, strlen(str)); - uv_mutex_unlock(&page_profile_lock); -} - -// this seems to be needed by the gc tests -#define JL_GC_N_MAX_POOLS 51 -JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; - -STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT -{ - // FIXME: MMTk would have to provide its own stats -} - -#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants - -JL_DLLEXPORT uint64_t jl_get_pg_size(void) -{ - return MMTK_GC_PAGE_SZ; -} - - -extern void mmtk_store_obj_size_c(void* obj, size_t size); - -inline void maybe_collect(jl_ptls_t ptls) -{ - // Just do a safe point for general maybe_collect - jl_gc_safepoint_(ptls); -} - -// This is only 
used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll),
-// is expensive. So we only check for every few allocations.
-static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz)
-{
-    // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to
-    // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage
-    // as much as we can.
-    if (ptls->gc_tls.malloc_sz_since_last_poll > 4096) {
-        jl_atomic_store_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, 0);
-        mmtk_gc_poll(ptls);
-    } else {
-        jl_atomic_fetch_add_relaxed(&ptls->gc_tls.malloc_sz_since_last_poll, sz);
-        jl_gc_safepoint_(ptls);
-    }
-}
-
 // allocation wrappers that track allocation and let collection run
-
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
@@ -601,7 +453,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT
     return jl_valueof(o);
 }
-
 JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty)
 {
     // safepoint
@@ -628,11 +479,6 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, siz
     return v;
 }
 
-void jl_gc_notify_image_load(const char* img_data, size_t len)
-{
-    mmtk_set_vm_space((void*)img_data, len);
-}
-
 JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz)
 {
     // safepoint
@@ -735,6 +581,38 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     return b;
 }
 
+void jl_gc_notify_image_load(const char* img_data, size_t len)
+{
+    mmtk_set_vm_space((void*)img_data, len);
+}
+
+// mutex for page profile
+uv_mutex_t page_profile_lock;
+
+JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream)
+{
+    uv_mutex_lock(&page_profile_lock);
+    const char *str = "Page profiler is unsupported in MMTk.";
+    ios_write(stream, str, strlen(str));
+    uv_mutex_unlock(&page_profile_lock);
+}
+
+// this seems to be needed by the gc tests
+#define JL_GC_N_MAX_POOLS 51
+JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS];
+
+STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT
+{
+    // FIXME: MMTk would have to provide its own stats
+}
+
+#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants
+
+JL_DLLEXPORT uint64_t jl_get_pg_size(void)
+{
+    return MMTK_GC_PAGE_SZ;
+}
+
 // Not used by mmtk
 // Number of GC threads that may run parallel marking
 int jl_n_markthreads;
@@ -791,12 +669,7 @@ void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT
 {
 }
 
-int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT
-{
-    return 0;
-}
-
-int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT
+int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT
 {
     return 0;
 }
diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h
index 2eb5f2a6a44d9..64a1bae192445 100644
--- a/src/gc-tls-mmtk.h
+++ b/src/gc-tls-mmtk.h
@@ -9,6 +9,8 @@
 extern "C" {
 #endif
 
+// This mostly removes some fields that are not used by MMTk
+
 typedef struct {
     // variable for tracking weak references
     small_arraylist_t weak_refs;

From c26632ed5d2be1effebe86bfa5ca844195933095 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Wed, 25 Sep 2024 01:20:29 +0000
Subject: [PATCH 084/116] Fixing merge conflicts

---
 src/gc-stock.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/gc-stock.c b/src/gc-stock.c index
5fd3b7efafead..078635f18e3ce 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3889,20 +3889,6 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) return sizeof(bigval_t); } -<<<<<<< HEAD -<<<<<<< HEAD - -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> c48a701f54 (WIP: Adding support for MMTk/Immix) -======= ->>>>>>> 0aee3ba32a (Refactoring to be considered before adding MMTk) -======= - ->>>>>>> 30ac6f081d (Fixing removed newlines) -======= ->>>>>>> 2efcdf8335 (Refactoring to be considered before adding MMTk) JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { arraylist_push(&ptls->gc_tls.sweep_objs, obj); From c283442edf340d882b13c9ec887a6d9bd44b2527 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:24:44 +0000 Subject: [PATCH 085/116] Applying changes from refactoring before adding MMTk --- src/gc-stock.h | 16 ++++++++++++++++ src/julia.h | 2 +- src/julia_internal.h | 2 +- src/stackwalk.c | 2 -- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/gc-stock.h b/src/gc-stock.h index 8e563f32ab9d3..6f75dcd014176 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -423,6 +423,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); @@ -699,4 +714,5 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #endif + #endif // !MMTK_GC diff --git a/src/julia.h b/src/julia.h index db57db1fbeb38..abb8a57ff13b0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -850,7 +850,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index d5013601a9124..e677f40907dfd 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -1052,7 +1052,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
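Note (illustrative sketch, not part of any commit): the gc_is_parallel_collector_thread/gc_is_concurrent_collector_thread predicates restored above encode a fixed thread-id layout for the stock GC — parallel mark threads occupy the tids starting at gc_first_tid, and the single optional concurrent sweeper comes right after them. Assuming gc_last_parallel_collector_thread_id() evaluates to gc_first_tid + jl_n_markthreads - 1, the partition can be summarized by the hypothetical helper below.

// Illustrative only: combined form of the two predicates above.
//   [gc_first_tid, gc_first_tid + jl_n_markthreads - 1] -> parallel mark threads
//   gc_first_tid + jl_n_markthreads                     -> concurrent sweeper
//                                                          (only if jl_n_sweepthreads == 1)
static int example_is_gc_thread(int tid)
{
    int last_parallel = gc_first_tid + jl_n_markthreads - 1;
    if (tid >= gc_first_tid && tid <= last_parallel)
        return 1; // parallel mark thread
    return jl_n_sweepthreads != 0 && tid == last_parallel + 1; // concurrent sweeper
}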
diff --git a/src/stackwalk.c b/src/stackwalk.c index e6fc2c7bbf56a..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT From 01aa62331858a7810efbcf5857edfda990a93e72 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 086/116] Refactoring to be considered before adding MMTk --- src/gc-common.c | 156 +++++++++++++++++++++++++++++++++++++++++++ src/gc-common.h | 6 ++ src/gc-debug.c | 41 +----------- src/gc-interface.h | 12 ++++ src/gc-stacks.c | 4 +- src/gc-stock.c | 156 ++++++++++++------------------------------- src/gc-stock.h | 21 ------ src/julia.h | 2 +- src/julia_internal.h | 26 +------- src/scheduler.c | 11 +++ src/stackwalk.c | 4 +- src/staticdata.c | 2 + 12 files changed, 237 insertions(+), 204 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index ee461b576ea9e..2ec167caa667a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -20,6 +20,11 @@ extern "C" { jl_gc_num_t gc_num = {0}; +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states; // MISC // =========================================================================== // +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_new_weakref_th(ptls, value); +} + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_alloc(ptls, sz, NULL); +} + +// allocation wrappers that save the size of allocations, to allow using +// jl_gc_counted_* functions with a libc-compatible API. 
+ +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
+typedef struct _mallocmemory_t {
+    jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory
+    struct _mallocmemory_t *next;
+} mallocmemory_t;
+
 #if defined(_OS_WINDOWS_)
 STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 {
diff --git a/src/gc-debug.c b/src/gc-debug.c
index 19dd93af5f236..d05fb4b49e9f7 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1105,46 +1105,7 @@ void gc_count_pool(void)
     jl_safe_printf("************************\n");
 }
 
-int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT
-{
-    int nf = (int)jl_datatype_nfields(vt);
-    for (int i = 1; i < nf; i++) {
-        if (slot < (void*)((char*)obj + jl_field_offset(vt, i)))
-            return i - 1;
-    }
-    return nf - 1;
-}
-
-int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
-{
-    char *slot = (char*)_slot;
-    jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj);
-    char *start = NULL;
-    size_t len = 0;
-    size_t elsize = sizeof(void*);
-    if (vt == jl_module_type) {
-        jl_module_t *m = (jl_module_t*)obj;
-        start = (char*)m->usings.items;
-        len = m->usings.len;
-    }
-    else if (vt == jl_simplevector_type) {
-        start = (char*)jl_svec_data(obj);
-        len = jl_svec_len(obj);
-    }
-    if (slot < start || slot >= start + elsize * len)
-        return -1;
-    return (slot - start) / elsize;
-}
-
-static int gc_logging_enabled = 0;
-
-JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
-    gc_logging_enabled = enable;
-}
-
-JL_DLLEXPORT int jl_is_gc_logging_enabled(void) {
-    return gc_logging_enabled;
-}
+extern int gc_logging_enabled;
 
 void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
     if (!gc_logging_enabled) {
diff --git a/src/gc-interface.h b/src/gc-interface.h
index e543b4b5879f1..682f22344d69d 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void);
 // Allocation
 // ========================================================================= //
 
+// On GCC, this function is inlined when sz is constant (see julia_internal.h)
+// In general, this function should implement allocation and should use the specific GC's logic
+// to decide whether to allocate a small or a large object. Finally, note that this function
+// **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record
+// an allocation of that type in the allocation profiler.
+struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty);
+
 // Allocates small objects and increments Julia allocation counters. Size of the object
 // header must be included in the object size. The (possibly unused in some implementations)
 // offset to the arena in which we're allocating is passed in the second parameter, and the
@@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align,
 // object being allocated and will be used to set the object header.
 struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT;
 
+// This function notifies the GC about memory addresses that are set when loading the boot image.
+// The GC may use that information to, for instance, determine that such objects should
+// be treated as marked and as belonging to the old generation in nursery collections.
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index 6b97881909bbd..6ebac8a0c079e 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -555,24 +555,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -649,17 +631,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -818,6 +789,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2794,6 +2788,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2832,11 +2841,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3397,13 +3401,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3685,63 +3682,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3875,18 +3815,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4014,14 +3942,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 46f7d3e11e105..cc661ce6e1600 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index ed3d9bf825658..b74de3060d26a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e..04857d440b643 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1077,7 +1053,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
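For readers following gc_is_parallel_collector_thread and
gc_is_concurrent_collector_thread as they move between gc-stock.h and
gc-stock.c across this series: the thread-id layout they encode is that the
parallel mark threads occupy a contiguous range of tids starting at
gc_first_tid, and the single concurrent sweep thread (when jl_n_sweepthreads
is nonzero) sits immediately after them. A standalone sketch with
hypothetical values, assuming the last parallel id is
gc_first_tid + jl_n_markthreads - 1:

#include <stdio.h>

/* Hypothetical configuration; in Julia these come from gc_first_tid,
   jl_n_markthreads and jl_n_sweepthreads. */
static const int first_tid = 2;
static const int n_markthreads = 3;
static const int n_sweepthreads = 1;

static int last_parallel_tid(void) { return first_tid + n_markthreads - 1; }

static int is_parallel_collector(int tid)
{
    return tid >= first_tid && tid <= last_parallel_tid();
}

static int is_concurrent_collector(int tid)
{
    /* at most one concurrent sweep thread, placed right after the mark threads */
    return n_sweepthreads != 0 && tid == last_parallel_tid() + 1;
}

int main(void)
{
    for (int tid = 0; tid < 8; tid++)
        printf("tid %d: parallel=%d concurrent=%d collector=%d\n", tid,
               is_parallel_collector(tid), is_concurrent_collector(tid),
               is_parallel_collector(tid) || is_concurrent_collector(tid));
    return 0;
}

This is also the relationship that patch 089's consolidated
gc_is_collector_thread predicate relies on.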
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 0a8cbe6db7c67..bba35e6dcb5f9 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,6 +657,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4054,6 +4055,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From e10e3caef963bd1086deb3fb7d42f014ca2a3771 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 087/116] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 6ebac8a0c079e..88b201a687eba 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3947,11 +3947,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index bba35e6dcb5f9..0a8cbe6db7c67 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,7 +657,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4055,7 +4054,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From d4c4360ab89dc9052cd87933b1f4b9e3581f4daa Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 088/116] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 88b201a687eba..55499bce61182 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2803,36 +2803,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } 
- } - else if (prev && !on) { - // enable -> disable - jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; From d07cae75b0b36b34a1b5150feab2b52d62a0c1ad Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 16 Sep 2024 06:38:02 +0000 Subject: [PATCH 089/116] Addressing PR comments --- src/gc-common.c | 134 +++++++++++++++++++++++++------------------ src/gc-common.h | 6 ++ src/gc-debug.c | 2 - src/gc-interface.h | 30 +--------- src/gc-stock.c | 18 +----- src/gc-stock.h | 15 +++++ src/julia.h | 2 +- src/julia_internal.h | 4 +- src/stackwalk.c | 10 +--- 9 files changed, 110 insertions(+), 111 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 03c046bc300f2..046feae6aa4c5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -491,15 +491,9 @@ int gc_n_threads; jl_ptls_t* gc_all_tls_states; // =========================================================================== // -// MISC +// Allocation // =========================================================================== // -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) { return jl_gc_alloc(ptls, sz, ty); @@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} +// =========================================================================== // +// Generic Memory +// =========================================================================== // size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT { @@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// =========================================================================== // +// GC Debug +// =========================================================================== // + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +// 
=========================================================================== // +// GC Control +// =========================================================================== // + +JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) { + return jl_atomic_load_acquire(&jl_gc_disable_counter); +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } -JL_DLLEXPORT int jl_gc_is_enabled(void) +// =========================================================================== // +// MISC +// =========================================================================== // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) { jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; + return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; +JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { + return ijl_small_typeof; } -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; + return jl_buff_tag; } -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) { - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; + jl_throw(jl_memory_exception); } #ifdef __cplusplus diff --git a/src/gc-common.h b/src/gc-common.h index 154b9659e9ccb..32b7470b13a58 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +// =========================================================================== // +// Logging +// =========================================================================== // + +extern int gc_logging_enabled; + #endif // JL_GC_COMMON_H diff --git a/src/gc-debug.c b/src/gc-debug.c index d05fb4b49e9f7..7c479484cde45 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,8 +1105,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -extern int gc_logging_enabled; - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t 
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 55499bce61182..b345fe08ff69c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2788,19 +2788,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3193,8 +3182,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index cc661ce6e1600..0f8d1eee67581 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index b74de3060d26a..ed3d9bf825658 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 04857d440b643..c079c06f0189a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1053,7 +1055,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From 8e15217b8a5eaea51335f6b7577ba929905a4a54 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 090/116] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From 0cb0784a43aa01803b73407c90bd5ee44d09531f Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 091/116] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
12634f36d67bd9c8275feda1e2729b0910ca2664 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 092/116] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From aa8093328cf5f70d9df78fda2315b077a76e4d8b Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 27 Sep 2024 00:49:07 +0000 Subject: [PATCH 093/116] Remove extern from free_stack declaration in julia_internal.h --- src/julia_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index c079c06f0189a..6fd537ed6baf8 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,7 +367,7 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; -extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; From 7ce3fe392616d4da1035de6b02a21056f05072b6 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:12:49 +0000 Subject: [PATCH 094/116] Putting everything that is common GC tls into gc-tls-common.h --- src/gc-common.c | 10 +-- src/gc-stacks.c | 18 +++--- src/gc-stock.c | 154 ++++++++++++++++++++++---------------------- src/gc-tls-common.h | 52 +++++++++++++++ src/gc-tls.h | 25 ------- src/julia_threads.h | 2 + src/stackwalk.c | 2 +- 7 files changed, 147 insertions(+), 116 deletions(-) create mode 100644 src/gc-tls-common.h diff --git a/src/gc-common.c b/src/gc-common.c index 417f12f26d64d..6ce455d3923ad 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -587,16 +587,16 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. 
mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { + if (ptls->gc_tls_common.heap.mafreelist == NULL) { ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); } else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; + ma = ptls->gc_tls_common.heap.mafreelist; + ptls->gc_tls_common.heap.mafreelist = ma->next; } ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; + ma->next = ptls->gc_tls_common.heap.mallocarrays; + ptls->gc_tls_common.heap.mallocarrays = ma; } // =========================================================================== // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8c44b65284386..a8fec938456a3 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -131,7 +131,7 @@ void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); return; } } @@ -160,7 +160,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); } } } @@ -175,7 +175,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - small_arraylist_t *pool = &ptls->gc_tls.heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->gc_tls_common.heap.free_stacks[pool_id]; if (pool->len > 0) { stk = small_arraylist_pop(pool); } @@ -196,7 +196,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - small_arraylist_t *live_tasks = &ptls->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls->gc_tls_common.heap.live_tasks; mtarraylist_push(live_tasks, owner); } return stk; @@ -223,7 +223,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; size_t n_to_free; if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { n_to_free = al->len; // not alive yet or dead, so it does not need these anymore @@ -245,10 +245,10 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls.heap.free_stacks); + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -299,7 +299,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) jl_ptls_t ptls2 = allstates[i]; if (ptls2 == NULL) continue; - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); l += n + (ptls2->root_task->ctx.stkbuf != NULL); } @@ 
-318,7 +318,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) goto restart; jl_array_data(a,void*)[j++] = t; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); diff --git a/src/gc-stock.c b/src/gc-stock.c index b345fe08ff69c..8e040c9b25dcf 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -357,7 +357,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *valu jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - small_arraylist_push(&ptls->gc_tls.heap.weak_refs, wr); + small_arraylist_push(&ptls->gc_tls_common.heap.weak_refs, wr); return wr; } @@ -367,8 +367,8 @@ static void clear_weak_refs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - size_t n, l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t n, l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; for (n = 0; n < l; n++) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) @@ -386,8 +386,8 @@ static void sweep_weak_refs(void) if (ptls2 != NULL) { size_t n = 0; size_t ndel = 0; - size_t l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; if (l == 0) continue; while (1) { @@ -402,7 +402,7 @@ static void sweep_weak_refs(void) lst[n] = lst[n + ndel]; lst[n + ndel] = tmp; } - ptls2->gc_tls.heap.weak_refs.len -= ndel; + ptls2->gc_tls_common.heap.weak_refs.len -= ndel; } } } @@ -410,18 +410,18 @@ static void sweep_weak_refs(void) STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc) + sz; if (alloc_acc < 16*1024) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, alloc_acc); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, alloc_acc); else { jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); } } STATIC_INLINE void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc) + sz); } // big value list @@ -442,10 +442,10 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -558,8 +558,8 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); jl_batch_accum_heap_size(ptls, sz); } @@ -578,18 +578,18 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc); - freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc); + freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -605,13 +605,13 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -654,8 +654,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocmemory_t *ma = ptls2->gc_tls.heap.mallocarrays; - mallocmemory_t **pma = &ptls2->gc_tls.heap.mallocarrays; + mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; + mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; while (ma != NULL) { mallocmemory_t *nxt = ma->next; jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); @@ -667,8 +667,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT *pma = nxt; int isaligned = (uintptr_t)ma->a & 1; jl_gc_free_memory(a, isaligned); - ma->next = ptls2->gc_tls.heap.mafreelist; - ptls2->gc_tls.heap.mafreelist = ma; + ma->next = ptls2->gc_tls_common.heap.mafreelist; + ptls2->gc_tls_common.heap.mafreelist = ma; } gc_time_count_mallocd_memory(bits); ma = nxt; @@ -729,12 +729,12 @@ STATIC_INLINE jl_value_t *jl_gc_small_alloc_inner(jl_ptls_t ptls, int offset, return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v != NULL) { @@ -971,8 +971,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ // instead of adding it to the thread that originally allocated the page, so we can avoid // an atomic-fetch-add here. 
size_t delta = (GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + delta); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + delta); jl_atomic_fetch_add_relaxed((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } @@ -1228,7 +1228,7 @@ static void gc_sweep_pool(void) } continue; } - jl_atomic_store_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -2834,7 +2834,7 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) for (int i = 0; i < n_threads; i++) { jl_ptls_t ptls2 = all_tls_states[i]; if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes); } } return pool_live_bytes; @@ -3183,11 +3183,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit assert(!gc_is_collector_thread(t_i)); + jl_thread_heap_common_t *common_heap = &ptls2->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls2->gc_tls.heap; - if (heap->weak_refs.len == 0) - small_arraylist_free(&heap->weak_refs); - if (heap->live_tasks.len == 0) - small_arraylist_free(&heap->live_tasks); + if (common_heap->weak_refs.len == 0) + small_arraylist_free(&common_heap->weak_refs); + if (common_heap->live_tasks.len == 0) + small_arraylist_free(&common_heap->live_tasks); if (heap->remset.len == 0) arraylist_free(&heap->remset); if (ptls2->finalizers.len == 0) @@ -3256,8 +3257,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -3362,6 +3363,7 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { + jl_thread_heap_common_t *common_heap = &ptls->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3369,12 +3371,12 @@ void jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - small_arraylist_new(&heap->weak_refs, 0); - small_arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&common_heap->weak_refs, 0); + small_arraylist_new(&common_heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) - small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; + small_arraylist_new(&common_heap->free_stacks[i], 0); + common_heap->mallocarrays = NULL; 
+ common_heap->mafreelist = NULL; heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; @@ -3400,8 +3402,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); } void jl_free_thread_gc_state(jl_ptls_t ptls) @@ -3579,10 +3581,10 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz); } return data; @@ -3596,10 +3598,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz * nm); } return data; @@ -3624,10 +3626,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); int64_t diff = sz - old; if (diff < 0) { @@ -3658,10 +3660,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef _OS_WINDOWS_ SetLastError(last_error); diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h new file mode 100644 index 0000000000000..28fbf2d0c448e --- /dev/null +++ b/src/gc-tls-common.h @@ -0,0 +1,52 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Meant to be included in "julia_threads.h" +#ifndef JL_GC_TLS_COMMON_H +#define JL_GC_TLS_COMMON_H + +#include "julia_atomics.h" + +// GC threading ------------------------------------------------------------------ + +#include "arraylist.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_common_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_common_t; + +typedef struct { + jl_thread_heap_common_t heap; + jl_thread_gc_num_common_t gc_num; +} jl_gc_tls_states_common_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..ecc815805a98b 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -21,16 +21,6 @@ typedef struct { } jl_gc_pool_t; typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects struct _bigval_t *young_generation_of_bigvals; @@ -42,22 +32,8 @@ typedef struct { // variables for allocating objects from pools #define JL_GC_N_MAX_POOLS 51 // conservative. 
must be kept in sync with `src/julia_internal.h` jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { ws_queue_t chunk_queue; ws_queue_t ptr_queue; @@ -78,7 +54,6 @@ typedef struct { typedef struct { jl_thread_heap_t heap; jl_gc_page_stack_t page_metadata_allocd; - jl_thread_gc_num_t gc_num; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..fcc28591658cb 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "gc-tls.h" +#include "gc-tls-common.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -155,6 +156,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/stackwalk.c b/src/stackwalk.c index a1de3a6d61a07..0988d7a833c94 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1309,7 +1309,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT if (ptls2 == NULL) { continue; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); int t_state = JL_TASK_STATE_DONE; jl_task_t *t = ptls2->root_task; From 048af72dee003a3ded89c3bf6c6572f97cb2678a Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:14:24 +0000 Subject: [PATCH 095/116] Typo --- src/gc-tls-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h index 28fbf2d0c448e..ba36f5c1c238e 100644 --- a/src/gc-tls-common.h +++ b/src/gc-tls-common.h @@ -49,4 +49,4 @@ typedef struct { } #endif -#endif // JL_GC_TLS_H +#endif // JL_GC_TLS_COMMON_H From fe61c2232d997da0ebd3b936a469024acff7afbb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 22:46:39 +0000 Subject: [PATCH 096/116] Adding gc-tls-common.h to Makefile as a public header --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a6b1f433b73ce..80bbdbcff67fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,7 +103,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix 
$(SRCDIR)/,win32_ucontext.h) endif From 380fd833efba491cb167ad9c61909199e14098d8 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 23:26:33 +0000 Subject: [PATCH 097/116] Removing gc-tls-common fields from gc-tls-mmtk.h --- src/gc-mmtk.c | 58 +++++++++++++++++------------------ src/gc-tls-mmtk.h | 30 ------------------ src/llvm-late-gc-lowering.cpp | 2 +- 3 files changed, 30 insertions(+), 60 deletions(-) diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index 98a5612871be0..aa010c73b27d2 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -108,7 +108,7 @@ void jl_start_gc_threads(void) { } void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { - jl_thread_heap_t *heap = &ptls->gc_tls.heap; + jl_thread_heap_common_t *heap = &ptls->gc_tls_common.heap; small_arraylist_new(&heap->weak_refs, 0); small_arraylist_new(&heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) @@ -124,7 +124,7 @@ void jl_init_thread_heap(struct _jl_tls_states_t *ptls) JL_NOTSAFEPOINT { memcpy(&ptls->gc_tls.mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) mmtk_post_bind_mutator(&ptls->gc_tls.mmtk_mutator, mmtk_mutator); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); } void jl_free_thread_gc_state(struct _jl_tls_states_t *ptls) { @@ -162,8 +162,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -186,15 +186,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -211,13 +211,13 @@ void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -257,8 +257,8 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) { void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); } void jl_gc_count_freed(size_t sz) JL_NOTSAFEPOINT @@ -473,8 +473,8 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, siz mmtk_immix_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, v, LLT_ALIGN(osize+sizeof(jl_taggedvalue_t), align)); } - ptls->gc_tls.gc_num.allocd += osize; - ptls->gc_tls.gc_num.poolalloc++; + ptls->gc_tls_common.gc_num.allocd += osize; + ptls->gc_tls_common.gc_num.poolalloc++; return v; } @@ -502,8 +502,8 @@ JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_big(jl_ptls_t ptls, size_t sz) } v->sz = allocsz; - ptls->gc_tls.gc_num.allocd += allocsz; - ptls->gc_tls.gc_num.bigalloc++; + ptls->gc_tls_common.gc_num.allocd += allocsz; + ptls->gc_tls_common.gc_num.bigalloc++; jl_value_t *result = jl_valueof(&v->header); mmtk_post_alloc(&ptls->gc_tls.mmtk_mutator, result, allocsz, 2); @@ -565,10 +565,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); // FIXME: Should these be part of mmtk's heap? 
// malloc_maybe_collect(ptls, sz); // jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, allocsz); diff --git a/src/gc-tls-mmtk.h b/src/gc-tls-mmtk.h index 64a1bae192445..7b1b249cd8ae3 100644 --- a/src/gc-tls-mmtk.h +++ b/src/gc-tls-mmtk.h @@ -9,37 +9,7 @@ extern "C" { #endif -// This mostly remove some fields that are not used by MMTk - -typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; -} jl_thread_heap_t; - -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { - jl_thread_heap_t heap; - jl_thread_gc_num_t gc_num; MMTkMutatorContext mmtk_mutator; size_t malloc_sz_since_last_poll; } jl_gc_tls_states_t; diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index d395771f6df0c..4b7dc0ec855a7 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -2528,7 +2528,7 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F) builder.CreateStore(new_cursor, cursor_ptr); // ptls->gc_tls.gc_num.allocd += osize; - auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)); + auto pool_alloc_pos = ConstantInt::get(Type::getInt64Ty(target->getContext()), offsetof(jl_tls_states_t, gc_tls_common) + offsetof(jl_gc_tls_states_common_t, gc_num)); auto pool_alloc_i8 = builder.CreateGEP(Type::getInt8Ty(target->getContext()), ptls, pool_alloc_pos); auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, PointerType::get(Type::getInt64Ty(target->getContext()), 0), "pool_alloc"); auto pool_allocd = builder.CreateLoad(Type::getInt64Ty(target->getContext()), pool_alloc_tls); From ebf478ad2783571684e64fa41c7868d40b105985 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 28 Aug 2024 00:44:09 +0000 Subject: [PATCH 098/116] Refactoring to be considered before adding MMTk --- src/gc-common.c | 156 +++++++++++++++++++++++++++++++++++++++++++ src/gc-common.h | 6 ++ src/gc-debug.c | 41 +----------- src/gc-interface.h | 12 ++++ src/gc-stacks.c | 4 +- src/gc-stock.c | 156 ++++++++++++------------------------------- src/gc-stock.h | 21 ------ src/julia.h | 2 +- src/julia_internal.h | 26 +------- src/scheduler.c | 11 +++ src/stackwalk.c | 4 +- src/staticdata.c | 2 + 12 files changed, 237 insertions(+), 204 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index ee461b576ea9e..2ec167caa667a 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -20,6 +20,11 @@ extern "C" { jl_gc_num_t gc_num = {0}; +JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) +{ + return gc_num.total_time; +} + // =========================================================================== // // GC Callbacks // =========================================================================== // @@ -489,6 +494,87 @@ jl_ptls_t* gc_all_tls_states; // MISC // =========================================================================== // +JL_DLLEXPORT jl_weakref_t 
*jl_gc_new_weakref(jl_value_t *value) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_new_weakref_th(ptls, value); +} + +JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc(ptls, sz, ty); +} + +JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return jl_gc_alloc(ptls, sz, NULL); +} + +// allocation wrappers that save the size of allocations, to allow using +// jl_gc_counted_* functions with a libc-compatible API. + +JL_DLLEXPORT void *jl_malloc(size_t sz) +{ + int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); + if (p == NULL) + return NULL; + p[0] = sz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +//_unchecked_calloc does not check for potential overflow of nm*sz +STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { + size_t nmsz = nm*sz; + int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); + if (p == NULL) + return NULL; + p[0] = nmsz; + return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) +{ + if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) + return NULL; + return _unchecked_calloc(nm, sz); +} + +JL_DLLEXPORT void jl_free(void *p) +{ + if (p != NULL) { + int64_t *pp = (int64_t *)p - 2; + size_t sz = pp[0]; + jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); + } +} + +JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) +{ + int64_t *pp; + size_t szold; + if (p == NULL) { + pp = NULL; + szold = 0; + } + else { + pp = (int64_t *)p - 2; + szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; + } + int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); + if (pnew == NULL) + return NULL; + pnew[0] = sz; + return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 +} + +// allocator entry points + +JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) +{ + return jl_gc_alloc_(ptls, sz, ty); +} + const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { @@ -501,6 +587,76 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) jl_throw(jl_memory_exception); } +size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT +{ + const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; + size_t sz = layout->size * m->length; + if (layout->flags.arrayelem_isunion) + // account for isbits Union array selector bytes + sz += m->length; + return sz; +} + +// tracking Memorys with malloc'd storage +void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ + // This is **NOT** a GC safe point. 
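// (Descriptive note on the body below: list nodes are recycled through
// `mafreelist` so this path does not call malloc on every tracked allocation,
// and the low bit of `ma->a` tags whether the block was allocated aligned;
// the sweep phase reads that bit back, as in `isaligned = (uintptr_t)ma->a & 1`,
// to pick the matching free routine.)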
+ mallocmemory_t *ma; + if (ptls->gc_tls.heap.mafreelist == NULL) { + ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); + } + else { + ma = ptls->gc_tls.heap.mafreelist; + ptls->gc_tls.heap.mafreelist = ma->next; + } + ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); + ma->next = ptls->gc_tls.heap.mallocarrays; + ptls->gc_tls.heap.mallocarrays = ma; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + +// gc-debug common functions +// --- + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + #ifdef __cplusplus } #endif diff --git a/src/gc-common.h b/src/gc-common.h index 4d53830442a7d..154b9659e9ccb 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -53,6 +53,12 @@ extern jl_gc_callback_list_t *gc_cblist_notify_gc_pressure; // malloc wrappers, aligned allocation // =========================================================================== // +// data structure for tracking malloc'd genericmemory. 
+typedef struct _mallocmemory_t { + jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory + struct _mallocmemory_t *next; +} mallocmemory_t; + #if defined(_OS_WINDOWS_) STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) { diff --git a/src/gc-debug.c b/src/gc-debug.c index 19dd93af5f236..d05fb4b49e9f7 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,46 +1105,7 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT -{ - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; -} - -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT -{ - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; -} - -static int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; -} - -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; -} +extern int gc_logging_enabled; void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { diff --git a/src/gc-interface.h b/src/gc-interface.h index e543b4b5879f1..682f22344d69d 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -128,6 +128,13 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void); // Allocation // ========================================================================= // +// On GCC, this function is inlined when sz is constant (see julia_internal.h) +// In general, this function should implement allocation and should use the specific GC's logic +// to decide whether to allocate a small or a large object. Finally, note that this function +// **must** also set the type of the returning object to be `ty`. The type `ty` may also be used to record +// an allocation of that type in the allocation profiler. +struct _jl_value_t *jl_gc_alloc_(struct _jl_tls_states_t * ptls, size_t sz, void *ty); + // Allocates small objects and increments Julia allocation counterst. Size of the object // header must be included in the object size. The (possibly unused in some implementations) // offset to the arena in which we're allocating is passed in the second parameter, and the @@ -211,6 +218,11 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; +// This function notifies the GC about memory addresses that are set when loading the boot image. +// The GC may use that information to, for instance, determine that such objects should +// be treated as marked and belonged to the old generation in nursery collections. 
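// To make the contract above concrete: the stock collector stubs this out as
// a no-op (see the gc-stock.c hunk further down in this patch), while a
// collector that wants to exploit it could record each image range and later
// classify pointers into those ranges as pre-marked, old-generation objects.
// A minimal sketch; all names in it (image_range_t, image_ranges,
// gc_ptr_in_image) are hypothetical, not part of this patch:
//
//   typedef struct { const char *start; size_t len; } image_range_t;
//   static image_range_t image_ranges[8]; // sysimg plus a few pkgimages
//   static size_t n_image_ranges = 0;
//
//   void jl_gc_notify_image_load(const char *img_data, size_t len)
//   {
//       assert(n_image_ranges < 8);
//       image_ranges[n_image_ranges].start = img_data;
//       image_ranges[n_image_ranges].len = len;
//       n_image_ranges++;
//   }
//
//   static int gc_ptr_in_image(const void *p)
//   {
//       for (size_t i = 0; i < n_image_ranges; i++)
//           if ((const char*)p >= image_ranges[i].start &&
//               (const char*)p <  image_ranges[i].start + image_ranges[i].len)
//               return 1;
//       return 0;
//   }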
+void jl_gc_notify_image_load(const char* img_data, size_t len); + // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..8c44b65284386 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -46,7 +46,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { VirtualFree(stkbuf, 0, MEM_RELEASE); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); @@ -81,7 +81,7 @@ static void *malloc_stack(size_t bufsz) JL_NOTSAFEPOINT return stk; } -static void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT { munmap(stkbuf, bufsz); jl_atomic_fetch_add_relaxed(&num_stack_mappings, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index 6b97881909bbd..6ebac8a0c079e 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -555,24 +555,6 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT gc_time_big_end(); } -// tracking Memorys with malloc'd storage - -void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ - // This is **NOT** a GC safe point. - mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { - ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); - } - else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; - } - ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; -} - - void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; @@ -649,17 +631,6 @@ void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT reset_thread_gc_counts(); } -size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT -{ - const jl_datatype_layout_t *layout = ((jl_datatype_t*)jl_typetagof(m))->layout; - size_t sz = layout->size * m->length; - if (layout->flags.arrayelem_isunion) - // account for isbits Union array selector bytes - sz += m->length; - return sz; -} - - static void jl_gc_free_memory(jl_value_t *v, int isaligned) JL_NOTSAFEPOINT { assert(jl_is_genericmemory(v)); @@ -818,6 +789,29 @@ jl_value_t *jl_gc_small_alloc_noinline(jl_ptls_t ptls, int offset, int osize) { return jl_gc_small_alloc_inner(ptls, offset, osize); } +// Size does NOT include the type tag!! +inline jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) +{ + jl_value_t *v; + const size_t allocsz = sz + sizeof(jl_taggedvalue_t); + if (sz <= GC_MAX_SZCLASS) { + int pool_id = jl_gc_szclass(allocsz); + jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; + int osize = jl_gc_sizeclasses[pool_id]; + // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in + // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) 
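// Note on the call below: the second argument is the byte offset of the
// selected size-class pool within `ptls`; the callee recovers the pool as
// `(jl_gc_pool_t*)((char*)ptls + offset)`, so the offset doubles as a compact
// pool identifier that the LLVM GC-lowering pass can also emit as a
// compile-time constant.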
+ v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); + } + else { + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + v = jl_gc_big_alloc_noinline(ptls, allocsz); + } + jl_set_typeof(v, ty); + maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); + return v; +} + int jl_gc_classify_pools(size_t sz, int *osize) { if (sz > GC_MAX_SZCLASS) @@ -2794,6 +2788,21 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } +int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -2832,11 +2841,6 @@ JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT *bytes = (num.total_allocd + num.deferred_alloc + num.allocd); } -JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) -{ - return gc_num.total_time; -} - JL_DLLEXPORT jl_gc_num_t jl_gc_num(void) { jl_gc_num_t num = gc_num; @@ -3397,13 +3401,6 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) gc_mark_roots(mq); } -// allocator entry points - -JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) -{ - return jl_gc_alloc_(ptls, sz, ty); -} - // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { @@ -3685,63 +3682,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size return data; } -// allocation wrappers that save the size of allocations, to allow using -// jl_gc_counted_* functions with a libc-compatible API. 
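// (For reference, the layout these wrappers rely on: each allocation is
// padded with JL_SMALL_BYTE_ALIGNMENT (16) header bytes, the requested size
// is stored in the first machine word, and the caller receives a pointer two
// int64 words past the start, so jl_free/jl_realloc can recover the exact
// size without a side table:
//
//     [ int64_t size | int64_t pad | payload ... ]
//     ^ raw block                   ^ pointer handed to the caller
// )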
- -JL_DLLEXPORT void *jl_malloc(size_t sz) -{ - int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + JL_SMALL_BYTE_ALIGNMENT); - if (p == NULL) - return NULL; - p[0] = sz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -//_unchecked_calloc does not check for potential overflow of nm*sz -STATIC_INLINE void *_unchecked_calloc(size_t nm, size_t sz) { - size_t nmsz = nm*sz; - int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1); - if (p == NULL) - return NULL; - p[0] = nmsz; - return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz) -{ - if (nm > SSIZE_MAX/sz - JL_SMALL_BYTE_ALIGNMENT) - return NULL; - return _unchecked_calloc(nm, sz); -} - -JL_DLLEXPORT void jl_free(void *p) -{ - if (p != NULL) { - int64_t *pp = (int64_t *)p - 2; - size_t sz = pp[0]; - jl_gc_counted_free_with_size(pp, sz + JL_SMALL_BYTE_ALIGNMENT); - } -} - -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz) -{ - int64_t *pp; - size_t szold; - if (p == NULL) { - pp = NULL; - szold = 0; - } - else { - pp = (int64_t *)p - 2; - szold = pp[0] + JL_SMALL_BYTE_ALIGNMENT; - } - int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz + JL_SMALL_BYTE_ALIGNMENT); - if (pnew == NULL) - return NULL; - pnew[0] = sz; - return (void *)(pnew + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16 -} - // allocating blocks for Arrays and Strings JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) @@ -3875,18 +3815,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT return jl_valueof(o); } -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - -JL_DLLEXPORT jl_value_t *jl_gc_allocobj(size_t sz) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_alloc(ptls, sz, NULL); -} - JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void) { if (jl_is_initialized()) { @@ -4014,14 +3942,14 @@ JL_DLLEXPORT size_t jl_gc_external_obj_hdr_size(void) } -JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) +JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) { - return jl_gc_alloc(ptls, sz, ty); + arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *obj) +void jl_gc_notify_image_load(const char* img_data, size_t len) { - arraylist_push(&ptls->gc_tls.sweep_objs, obj); + // Do nothing } #ifdef __cplusplus diff --git a/src/gc-stock.h b/src/gc-stock.h index 46f7d3e11e105..cc661ce6e1600 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -106,12 +106,6 @@ JL_EXTENSION typedef struct _bigval_t { // must be 64-byte aligned here, in 32 & 64 bit modes } bigval_t; -// data structure for tracking malloc'd genericmemory. 
-typedef struct _mallocmemory_t { - jl_genericmemory_t *a; // lowest bit is tagged if this is aligned memory - struct _mallocmemory_t *next; -} mallocmemory_t; - // pool page metadata typedef struct _jl_gc_pagemeta_t { // next metadata structure in per-thread list @@ -428,21 +422,6 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } -STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; -} - STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index ed3d9bf825658..b74de3060d26a 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e..04857d440b643 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -513,30 +513,6 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass_align8(unsigned sz) JL_NOTSAFE #define GC_MAX_SZCLASS (2032-sizeof(void*)) static_assert(ARRAY_CACHE_ALIGN_THRESHOLD > GC_MAX_SZCLASS, ""); - -// Size does NOT include the type tag!! -STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) -{ - jl_value_t *v; - const size_t allocsz = sz + sizeof(jl_taggedvalue_t); - if (sz <= GC_MAX_SZCLASS) { - int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->gc_tls.heap.norm_pools[pool_id]; - int osize = jl_gc_sizeclasses[pool_id]; - // We call `jl_gc_small_alloc_noinline` instead of `jl_gc_small_alloc` to avoid double-counting in - // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) - v = jl_gc_small_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); - } - else { - if (allocsz < sz) // overflow in adding offs, size was "negative" - jl_throw(jl_memory_exception); - v = jl_gc_big_alloc_noinline(ptls, allocsz); - } - jl_set_typeof(v, ty); - maybe_record_alloc_to_profile(v, sz, (jl_datatype_t*)ty); - return v; -} - /* Programming style note: When using jl_gc_alloc, do not JL_GC_PUSH it into a * gc frame, until it has been fully initialized. An uninitialized value in a * gc frame can crash upon encountering the first safepoint. By delaying use of @@ -1077,7 +1053,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern _Atomic(uint32_t) jl_gc_disable_counter; +extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. 
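// (Context for the export change above: `jl_gc_disable_counter` backs the
// counter-based jl_gc_enable protocol, which patch 100 below moves into
// gc-common.c. A sketch of typical embedder usage, shown here only for
// reference:
//
//     int prev = jl_gc_enable(0);   // disable collection; returns prior state
//     /* region that must not trigger a collection */
//     jl_gc_enable(prev);           // restore the previous state
// )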
diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..b85a481588e4f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,9 +80,20 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } +<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; +======= +// parallel task runtime +// --- + +JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return cong(max, &ptls->rngseed); +} +>>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) diff --git a/src/stackwalk.c b/src/stackwalk.c index 6aa36fa8b499c..5f28b61c4a8fe 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -5,7 +5,7 @@ utilities for walking the stack and looking up information about code addresses */ #include -#include "gc-stock.h" +#include "gc-common.h" #include "julia.h" #include "julia_internal.h" #include "threading.h" @@ -1294,6 +1294,8 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; +extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT diff --git a/src/staticdata.c b/src/staticdata.c index 0a8cbe6db7c67..bba35e6dcb5f9 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,6 +657,7 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4054,6 +4055,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); + jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From 0a8444ea6f539cdb63481f45411f42629c1c97e1 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 04:57:59 +0000 Subject: [PATCH 099/116] Removing jl_gc_notify_image_load, since it's a new function and not part of the refactoring --- src/gc-interface.h | 5 ----- src/gc-stock.c | 5 ----- src/staticdata.c | 2 -- 3 files changed, 12 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index 682f22344d69d..25ffed4524f0c 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -218,11 +218,6 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, // object being allocated and will be used to set the object header. struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT; -// This function notifies the GC about memory addresses that are set when loading the boot image. -// The GC may use that information to, for instance, determine that such objects should -// be treated as marked and belonged to the old generation in nursery collections. 
-void jl_gc_notify_image_load(const char* img_data, size_t len); - // ========================================================================= // // Runtime Write-Barriers // ========================================================================= // diff --git a/src/gc-stock.c b/src/gc-stock.c index 6ebac8a0c079e..88b201a687eba 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3947,11 +3947,6 @@ JL_DLLEXPORT void jl_gc_schedule_foreign_sweepfunc(jl_ptls_t ptls, jl_value_t *o arraylist_push(&ptls->gc_tls.sweep_objs, obj); } -void jl_gc_notify_image_load(const char* img_data, size_t len) -{ - // Do nothing -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index bba35e6dcb5f9..0a8cbe6db7c67 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -657,7 +657,6 @@ static void jl_load_sysimg_so(void) plen = (size_t *)&jl_system_image_size; else jl_dlsym(jl_sysimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(sysimg_data, *plen); jl_restore_system_image_data(sysimg_data, *plen); } @@ -4055,7 +4054,6 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_dlsym(pkgimg_handle, "jl_system_image_data", (void **)&pkgimg_data, 1); size_t *plen; jl_dlsym(pkgimg_handle, "jl_system_image_size", (void **)&plen, 1); - jl_gc_notify_image_load(pkgimg_data, *plen); jl_image_t pkgimage = jl_init_processor_pkgimg(pkgimg_handle); From c8818eab4ec04a248121bef73e1dd5e3b29a3ceb Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 2 Sep 2024 01:27:08 +0000 Subject: [PATCH 100/116] Moving gc_enable code to gc-common.c --- src/gc-common.c | 30 ++++++++++++++++++++++++++++++ src/gc-stock.c | 30 ------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 2ec167caa667a..03c046bc300f2 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -613,6 +613,36 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// collector entry point and control +_Atomic(uint32_t) jl_gc_disable_counter = 1; + +JL_DLLEXPORT int jl_gc_enable(int on) +{ + jl_ptls_t ptls = jl_current_task->ptls; + int prev = !ptls->disable_gc; + ptls->disable_gc = (on == 0); + if (on && !prev) { + // disable -> enable + if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + gc_num.allocd += gc_num.deferred_alloc; + gc_num.deferred_alloc = 0; + } + } + else if (prev && !on) { + // enable -> disable + jl_atomic_fetch_add(&jl_gc_disable_counter, 1); + // check if the GC is running and wait for it to finish + jl_gc_safepoint_(ptls); + } + return prev; +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + int gc_logging_enabled = 0; JL_DLLEXPORT void jl_enable_gc_logging(int enable) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 88b201a687eba..55499bce61182 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2803,36 +2803,6 @@ int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT return tid == concurrent_collector_thread_id; } -// collector entry point and control -_Atomic(uint32_t) jl_gc_disable_counter = 1; - -JL_DLLEXPORT int jl_gc_enable(int on) -{ - jl_ptls_t ptls = jl_current_task->ptls; - int prev = !ptls->disable_gc; - ptls->disable_gc = (on == 0); - if (on && !prev) { - // disable -> enable - if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { - gc_num.allocd += gc_num.deferred_alloc; - gc_num.deferred_alloc = 0; - } 
- } - else if (prev && !on) { - // enable -> disable - jl_atomic_fetch_add(&jl_gc_disable_counter, 1); - // check if the GC is running and wait for it to finish - jl_gc_safepoint_(ptls); - } - return prev; -} - -JL_DLLEXPORT int jl_gc_is_enabled(void) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT { jl_gc_num_t num = gc_num; From e721e0c121ee911c29e736668b5e20766844d85e Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Mon, 16 Sep 2024 06:38:02 +0000 Subject: [PATCH 101/116] Addressing PR comments --- src/gc-common.c | 134 +++++++++++++++++++++++++------------------ src/gc-common.h | 6 ++ src/gc-debug.c | 2 - src/gc-interface.h | 30 +--------- src/gc-stock.c | 18 +----- src/gc-stock.h | 15 +++++ src/julia.h | 2 +- src/julia_internal.h | 4 +- src/stackwalk.c | 10 +--- 9 files changed, 110 insertions(+), 111 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 03c046bc300f2..046feae6aa4c5 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -491,15 +491,9 @@ int gc_n_threads; jl_ptls_t* gc_all_tls_states; // =========================================================================== // -// MISC +// Allocation // =========================================================================== // -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return jl_gc_new_weakref_th(ptls, value); -} - JL_DLLEXPORT void * jl_gc_alloc_typed(jl_ptls_t ptls, size_t sz, void *ty) { return jl_gc_alloc(ptls, sz, ty); @@ -575,17 +569,9 @@ JL_DLLEXPORT jl_value_t *(jl_gc_alloc)(jl_ptls_t ptls, size_t sz, void *ty) return jl_gc_alloc_(ptls, sz, ty); } -const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 -JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT -{ - return jl_buff_tag; -} - -// callback for passing OOM errors from gmp -JL_DLLEXPORT void jl_throw_out_of_memory_error(void) -{ - jl_throw(jl_memory_exception); -} +// =========================================================================== // +// Generic Memory +// =========================================================================== // size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT { @@ -613,6 +599,66 @@ void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, i ptls->gc_tls.heap.mallocarrays = ma; } +// =========================================================================== // +// GC Debug +// =========================================================================== // + +int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +{ + int nf = (int)jl_datatype_nfields(vt); + for (int i = 1; i < nf; i++) { + if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) + return i - 1; + } + return nf - 1; +} + +int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +{ + char *slot = (char*)_slot; + jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); + char *start = NULL; + size_t len = 0; + size_t elsize = sizeof(void*); + if (vt == jl_module_type) { + jl_module_t *m = (jl_module_t*)obj; + start = (char*)m->usings.items; + len = m->usings.len; + } + else if (vt == jl_simplevector_type) { + start = (char*)jl_svec_data(obj); + len = jl_svec_len(obj); + } + if (slot < start || slot >= start + elsize * len) + return -1; + return (slot - start) / elsize; +} + +// 
=========================================================================== // +// GC Control +// =========================================================================== // + +JL_DLLEXPORT uint32_t jl_get_gc_disable_counter(void) { + return jl_atomic_load_acquire(&jl_gc_disable_counter); +} + +JL_DLLEXPORT int jl_gc_is_enabled(void) +{ + jl_ptls_t ptls = jl_current_task->ptls; + return !ptls->disable_gc; +} + +int gc_logging_enabled = 0; + +JL_DLLEXPORT void jl_enable_gc_logging(int enable) { + gc_logging_enabled = enable; +} + +JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { + return gc_logging_enabled; +} + + // collector entry point and control _Atomic(uint32_t) jl_gc_disable_counter = 1; @@ -637,54 +683,30 @@ JL_DLLEXPORT int jl_gc_enable(int on) return prev; } -JL_DLLEXPORT int jl_gc_is_enabled(void) +// =========================================================================== // +// MISC +// =========================================================================== // + +JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) { jl_ptls_t ptls = jl_current_task->ptls; - return !ptls->disable_gc; -} - -int gc_logging_enabled = 0; - -JL_DLLEXPORT void jl_enable_gc_logging(int enable) { - gc_logging_enabled = enable; + return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT int jl_is_gc_logging_enabled(void) { - return gc_logging_enabled; +JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { + return ijl_small_typeof; } -// gc-debug common functions -// --- - -int gc_slot_to_fieldidx(void *obj, void *slot, jl_datatype_t *vt) JL_NOTSAFEPOINT +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { - int nf = (int)jl_datatype_nfields(vt); - for (int i = 1; i < nf; i++) { - if (slot < (void*)((char*)obj + jl_field_offset(vt, i))) - return i - 1; - } - return nf - 1; + return jl_buff_tag; } -int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT +// callback for passing OOM errors from gmp +JL_DLLEXPORT void jl_throw_out_of_memory_error(void) { - char *slot = (char*)_slot; - jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(obj); - char *start = NULL; - size_t len = 0; - size_t elsize = sizeof(void*); - if (vt == jl_module_type) { - jl_module_t *m = (jl_module_t*)obj; - start = (char*)m->usings.items; - len = m->usings.len; - } - else if (vt == jl_simplevector_type) { - start = (char*)jl_svec_data(obj); - len = jl_svec_len(obj); - } - if (slot < start || slot >= start + elsize * len) - return -1; - return (slot - start) / elsize; + jl_throw(jl_memory_exception); } #ifdef __cplusplus diff --git a/src/gc-common.h b/src/gc-common.h index 154b9659e9ccb..32b7470b13a58 100644 --- a/src/gc-common.h +++ b/src/gc-common.h @@ -179,4 +179,10 @@ JL_DLLEXPORT void jl_finalize_th(jl_task_t *ct, jl_value_t *o); extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +// =========================================================================== // +// Logging +// =========================================================================== // + +extern int gc_logging_enabled; + #endif // JL_GC_COMMON_H diff --git a/src/gc-debug.c b/src/gc-debug.c index d05fb4b49e9f7..7c479484cde45 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1105,8 +1105,6 @@ void gc_count_pool(void) jl_safe_printf("************************\n"); } -extern int gc_logging_enabled; - void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t 
live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; diff --git a/src/gc-interface.h b/src/gc-interface.h index 25ffed4524f0c..0e9ce32697f35 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -94,6 +94,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem); // should run a collection cycle again (e.g. a full mark right after a full sweep to ensure // we do a full heap traversal). JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection); +// Returns whether the thread with `tid` is a collector thread +JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT; // ========================================================================= // // Metrics @@ -162,26 +164,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz); JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz); // Wrapper around Libc realloc that updates Julia allocation counters. JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz); -// Wrapper around Libc malloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_malloc(size_t sz); -// Wrapper around Libc calloc that allocates a memory region with a few additional machine -// words before the actual payload that are used to record the size of the requested -// allocation. Also updates Julia allocation counters. The function returns a pointer to the -// payload as a result of the allocation. -JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz); -// Wrapper around Libc free that takes a pointer to the payload of a memory region allocated -// with jl_malloc or jl_calloc, and uses the size information stored in the first machine -// words of the memory buffer update Julia allocation counters, and then frees the -// corresponding memory buffer. -JL_DLLEXPORT void jl_free(void *p); -// Wrapper around Libc realloc that takes a memory region allocated with jl_malloc or -// jl_calloc, and uses the size information stored in the first machine words of the memory -// buffer to update Julia allocation counters, reallocating the corresponding memory buffer -// in the end. -JL_DLLEXPORT void *jl_realloc(void *p, size_t sz); // Wrapper around Libc malloc that's used to dynamically allocate memory for Arrays and // Strings. It increments Julia allocation counters and should check whether we're close to // the Julia heap target, and therefore, whether we should run a collection. Note that this @@ -195,14 +177,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz); // thread-local allocator of the thread referenced by the first jl_ptls_t argument. JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref_th(struct _jl_tls_states_t *ptls, struct _jl_value_t *value); -// Allocates a new weak-reference, assigns its value and increments Julia allocation -// counters. If thread-local allocators are used, then this function should allocate in the -// thread-local allocator of the current thread. -JL_DLLEXPORT struct _jl_weakref_t *jl_gc_new_weakref(struct _jl_value_t *value); -// Allocates an object whose size is specified by the function argument and increments Julia -// allocation counters. If thread-local allocators are used, then this function should -// allocate in the thread-local allocator of the current thread. 
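// (These convenience declarations only leave the public GC interface header;
// their definitions remain in gc-common.c, where patch 098 above placed them.)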
-JL_DLLEXPORT struct _jl_value_t *jl_gc_allocobj(size_t sz); // Permanently allocates a memory slot of the size specified by the first parameter. This // block of memory is allocated in an immortal region that is never swept. The second // parameter specifies whether the memory should be filled with zeros. The third and fourth diff --git a/src/gc-stock.c b/src/gc-stock.c index 55499bce61182..b345fe08ff69c 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -2788,19 +2788,8 @@ static void sweep_finalizer_list(arraylist_t *list) list->len = j; } -int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT -{ - return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); -} - -int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT -{ - if (jl_n_sweepthreads == 0) { - return 0; - } - int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); - int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; - return tid == concurrent_collector_thread_id; +int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT { + return gc_is_parallel_collector_thread(tid) || gc_is_concurrent_collector_thread(tid); } JL_DLLEXPORT void jl_gc_get_total_bytes(int64_t *bytes) JL_NOTSAFEPOINT @@ -3193,8 +3182,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // free empty GC state for threads that have exited if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit - assert(!gc_is_parallel_collector_thread(t_i)); - assert(!gc_is_concurrent_collector_thread(t_i)); + assert(!gc_is_collector_thread(t_i)); jl_thread_heap_t *heap = &ptls2->gc_tls.heap; if (heap->weak_refs.len == 0) small_arraylist_free(&heap->weak_refs); diff --git a/src/gc-stock.h b/src/gc-stock.h index cc661ce6e1600..0f8d1eee67581 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -422,6 +422,21 @@ STATIC_INLINE int gc_ith_parallel_collector_thread_id(int i) JL_NOTSAFEPOINT return gc_first_tid + i; } +STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT +{ + return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); +} + +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/julia.h b/src/julia.h index b74de3060d26a..ed3d9bf825658 100644 --- a/src/julia.h +++ b/src/julia.h @@ -858,7 +858,7 @@ static inline jl_value_t *jl_to_typeof(uintptr_t t) return (jl_value_t*)t; } #else -extern JL_DLLEXPORT jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; +extern JL_HIDDEN jl_datatype_t *ijl_small_typeof[(jl_max_tags << 4) / sizeof(jl_datatype_t*)]; static inline jl_value_t *jl_to_typeof(uintptr_t t) { if (t < (jl_max_tags << 4)) diff --git a/src/julia_internal.h b/src/julia_internal.h index 04857d440b643..c079c06f0189a 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,6 +367,8 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; +extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; + JL_DLLEXPORT 
extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; @@ -1053,7 +1055,7 @@ STATIC_INLINE int jl_addr_is_safepoint(uintptr_t addr) return addr >= safepoint_addr && addr < safepoint_addr + jl_page_size * 4; } extern _Atomic(uint32_t) jl_gc_running; -extern JL_DLLEXPORT _Atomic(uint32_t) jl_gc_disable_counter; +extern _Atomic(uint32_t) jl_gc_disable_counter; // All the functions are safe to be called from within a signal handler // provided that the thread will not be interrupted by another asynchronous // signal. diff --git a/src/stackwalk.c b/src/stackwalk.c index 5f28b61c4a8fe..a1de3a6d61a07 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1294,8 +1294,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; -extern int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT; -extern int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT; // Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT @@ -1304,12 +1302,8 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); for (size_t i = 0; i < nthreads; i++) { jl_ptls_t ptls2 = allstates[i]; - if (gc_is_parallel_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for parallel GC thread %zu\n", i + 1); - continue; - } - if (gc_is_concurrent_collector_thread(i)) { - jl_safe_printf("==== Skipping backtrace for concurrent GC thread %zu\n", i + 1); + if (gc_is_collector_thread(i)) { + jl_safe_printf("==== Skipping backtrace for parallel/concurrent GC thread %zu\n", i + 1); continue; } if (ptls2 == NULL) { From 6c0eb93fccbd77a338c6a6e2ddae8888fa6bc1b2 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 19 Sep 2024 04:18:13 +0000 Subject: [PATCH 102/116] Push resolution of merge conflict --- src/scheduler.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index b85a481588e4f..bb2f85b52283f 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,20 +80,9 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -<<<<<<< HEAD // GC functions used extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; -======= -// parallel task runtime -// --- - -JL_DLLEXPORT uint32_t jl_rand_ptls(uint32_t max) // [0, n) -{ - jl_ptls_t ptls = jl_current_task->ptls; - return cong(max, &ptls->rngseed); -} ->>>>>>> 4f39869d04 (Refactoring to be considered before adding MMTk) // initialize the threading infrastructure // (called only by the main thread) From fb0ec76ecc52efae85ad65c34b1a3f49f24475e7 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 01:10:31 +0000 Subject: [PATCH 103/116] Removing jl_gc_mark_queue_obj_explicit extern definition from scheduler.c --- src/scheduler.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/scheduler.c b/src/scheduler.c index bb2f85b52283f..7e23f654c2566 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -80,10 +80,6 @@ JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSA return 1; } -// GC functions used -extern int jl_gc_mark_queue_obj_explicit(jl_gc_mark_cache_t *gc_cache, - jl_gc_markqueue_t *mq, jl_value_t *obj) JL_NOTSAFEPOINT; - // initialize the threading infrastructure // (called only by the main thread) void jl_init_threadinginfra(void) From 
3eea0790d832eba1d17b1a1564447f51986c7118 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Wed, 25 Sep 2024 02:50:25 +0000 Subject: [PATCH 104/116] Don't need the getter function since it's possible to use jl_small_typeof directly --- src/gc-common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/gc-common.c b/src/gc-common.c index 046feae6aa4c5..417f12f26d64d 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -693,10 +693,6 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref(jl_value_t *value) return jl_gc_new_weakref_th(ptls, value); } -JL_DLLEXPORT jl_datatype_t **jl_get_ijl_small_typeof(void) { - return ijl_small_typeof; -} - const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT { From ef6c79823306f2556951d6f8a70b165aceda2c76 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Fri, 27 Sep 2024 00:49:07 +0000 Subject: [PATCH 105/116] Remove extern from free_stack declaration in julia_internal.h --- src/julia_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/julia_internal.h b/src/julia_internal.h index c079c06f0189a..6fd537ed6baf8 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -367,7 +367,7 @@ extern jl_function_t *jl_typeinf_func JL_GLOBALLY_ROOTED; extern JL_DLLEXPORT size_t jl_typeinf_world; extern _Atomic(jl_typemap_entry_t*) call_cache[N_CALL_CACHE] JL_GLOBALLY_ROOTED; -extern void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; +void free_stack(void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT; JL_DLLEXPORT extern int jl_lineno; JL_DLLEXPORT extern const char *jl_filename; From 63ca362bfaeed147887da242a6721de014ca5535 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:12:49 +0000 Subject: [PATCH 106/116] Putting everything that is common GC tls into gc-tls-common.h --- src/gc-common.c | 10 +-- src/gc-stacks.c | 18 +++--- src/gc-stock.c | 154 ++++++++++++++++++++++---------------------- src/gc-tls-common.h | 52 +++++++++++++++ src/gc-tls.h | 25 ------- src/julia_threads.h | 2 + src/stackwalk.c | 2 +- 7 files changed, 147 insertions(+), 116 deletions(-) create mode 100644 src/gc-tls-common.h diff --git a/src/gc-common.c b/src/gc-common.c index 417f12f26d64d..6ce455d3923ad 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -587,16 +587,16 @@ size_t jl_genericmemory_nbytes(jl_genericmemory_t *m) JL_NOTSAFEPOINT void jl_gc_track_malloced_genericmemory(jl_ptls_t ptls, jl_genericmemory_t *m, int isaligned){ // This is **NOT** a GC safe point. 
mallocmemory_t *ma; - if (ptls->gc_tls.heap.mafreelist == NULL) { + if (ptls->gc_tls_common.heap.mafreelist == NULL) { ma = (mallocmemory_t*)malloc_s(sizeof(mallocmemory_t)); } else { - ma = ptls->gc_tls.heap.mafreelist; - ptls->gc_tls.heap.mafreelist = ma->next; + ma = ptls->gc_tls_common.heap.mafreelist; + ptls->gc_tls_common.heap.mafreelist = ma->next; } ma->a = (jl_genericmemory_t*)((uintptr_t)m | !!isaligned); - ma->next = ptls->gc_tls.heap.mallocarrays; - ptls->gc_tls.heap.mallocarrays = ma; + ma->next = ptls->gc_tls_common.heap.mallocarrays; + ptls->gc_tls_common.heap.mallocarrays = ma; } // =========================================================================== // diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 8c44b65284386..a8fec938456a3 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -131,7 +131,7 @@ void _jl_free_stack(jl_ptls_t ptls, void *stkbuf, size_t bufsz) JL_NOTSAFEPOINT if (bufsz <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(bufsz); if (pool_sizes[pool_id] == bufsz) { - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); return; } } @@ -160,7 +160,7 @@ void jl_release_task_stack(jl_ptls_t ptls, jl_task_t *task) #ifdef _COMPILER_ASAN_ENABLED_ __asan_unpoison_stack_memory((uintptr_t)stkbuf, bufsz); #endif - small_arraylist_push(&ptls->gc_tls.heap.free_stacks[pool_id], stkbuf); + small_arraylist_push(&ptls->gc_tls_common.heap.free_stacks[pool_id], stkbuf); } } } @@ -175,7 +175,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO if (ssize <= pool_sizes[JL_N_STACK_POOLS - 1]) { unsigned pool_id = select_pool(ssize); ssize = pool_sizes[pool_id]; - small_arraylist_t *pool = &ptls->gc_tls.heap.free_stacks[pool_id]; + small_arraylist_t *pool = &ptls->gc_tls_common.heap.free_stacks[pool_id]; if (pool->len > 0) { stk = small_arraylist_pop(pool); } @@ -196,7 +196,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO } *bufsz = ssize; if (owner) { - small_arraylist_t *live_tasks = &ptls->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls->gc_tls_common.heap.live_tasks; mtarraylist_push(live_tasks, owner); } return stk; @@ -223,7 +223,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT // free half of stacks that remain unused since last sweep for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; + small_arraylist_t *al = &ptls2->gc_tls_common.heap.free_stacks[p]; size_t n_to_free; if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { n_to_free = al->len; // not alive yet or dead, so it does not need these anymore @@ -245,10 +245,10 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(ptls2->gc_tls.heap.free_stacks); + small_arraylist_free(ptls2->gc_tls_common.heap.free_stacks); } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = 0; size_t ndel = 0; size_t l = live_tasks->len; @@ -299,7 +299,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) jl_ptls_t ptls2 = allstates[i]; if (ptls2 == NULL) continue; - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); l += n + (ptls2->root_task->ctx.stkbuf != NULL); } @@ 
-318,7 +318,7 @@ JL_DLLEXPORT jl_array_t *jl_live_tasks(void) goto restart; jl_array_data(a,void*)[j++] = t; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); for (size_t i = 0; i < n; i++) { jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); diff --git a/src/gc-stock.c b/src/gc-stock.c index b345fe08ff69c..8e040c9b25dcf 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -357,7 +357,7 @@ JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *valu jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); wr->value = value; // NOTE: wb not needed here - small_arraylist_push(&ptls->gc_tls.heap.weak_refs, wr); + small_arraylist_push(&ptls->gc_tls_common.heap.weak_refs, wr); return wr; } @@ -367,8 +367,8 @@ static void clear_weak_refs(void) for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) { - size_t n, l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t n, l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; for (n = 0; n < l; n++) { jl_weakref_t *wr = (jl_weakref_t*)lst[n]; if (!gc_marked(jl_astaggedvalue(wr->value)->bits.gc)) @@ -386,8 +386,8 @@ static void sweep_weak_refs(void) if (ptls2 != NULL) { size_t n = 0; size_t ndel = 0; - size_t l = ptls2->gc_tls.heap.weak_refs.len; - void **lst = ptls2->gc_tls.heap.weak_refs.items; + size_t l = ptls2->gc_tls_common.heap.weak_refs.len; + void **lst = ptls2->gc_tls_common.heap.weak_refs.items; if (l == 0) continue; while (1) { @@ -402,7 +402,7 @@ static void sweep_weak_refs(void) lst[n] = lst[n + ndel]; lst[n + ndel] = tmp; } - ptls2->gc_tls.heap.weak_refs.len -= ndel; + ptls2->gc_tls_common.heap.weak_refs.len -= ndel; } } } @@ -410,18 +410,18 @@ static void sweep_weak_refs(void) STATIC_INLINE void jl_batch_accum_heap_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc) + sz; + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc) + sz; if (alloc_acc < 16*1024) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, alloc_acc); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, alloc_acc); else { jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); } } STATIC_INLINE void jl_batch_accum_free_size(jl_ptls_t ptls, uint64_t sz) JL_NOTSAFEPOINT { - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc) + sz); } // big value list @@ -442,10 +442,10 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -558,8 +558,8 @@ static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); jl_batch_accum_heap_size(ptls, sz); } @@ -578,18 +578,18 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest, int update_heap) JL_NOTS for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { - dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval); - dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc); - dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc); - dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc); - dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.bigalloc); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.bigalloc); + dest->freed += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); if (update_heap) { - uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.alloc_acc); - freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.free_acc); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc); + freed_in_runtime += jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.free_acc); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -605,13 +605,13 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls != NULL) { // don't reset `pool_live_bytes` here - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.bigalloc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.alloc_acc, 0); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.free_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, 0); + 
jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.bigalloc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.alloc_acc, 0); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.free_acc, 0); } } } @@ -654,8 +654,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT for (int t_i = 0; t_i < gc_n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - mallocmemory_t *ma = ptls2->gc_tls.heap.mallocarrays; - mallocmemory_t **pma = &ptls2->gc_tls.heap.mallocarrays; + mallocmemory_t *ma = ptls2->gc_tls_common.heap.mallocarrays; + mallocmemory_t **pma = &ptls2->gc_tls_common.heap.mallocarrays; while (ma != NULL) { mallocmemory_t *nxt = ma->next; jl_value_t *a = (jl_value_t*)((uintptr_t)ma->a & ~1); @@ -667,8 +667,8 @@ static void sweep_malloced_memory(void) JL_NOTSAFEPOINT *pma = nxt; int isaligned = (uintptr_t)ma->a & 1; jl_gc_free_memory(a, isaligned); - ma->next = ptls2->gc_tls.heap.mafreelist; - ptls2->gc_tls.heap.mafreelist = ma; + ma->next = ptls2->gc_tls_common.heap.mafreelist; + ptls2->gc_tls_common.heap.mafreelist = ma; } gc_time_count_mallocd_memory(bits); ma = nxt; @@ -729,12 +729,12 @@ STATIC_INLINE jl_value_t *jl_gc_small_alloc_inner(jl_ptls_t ptls, int offset, return jl_gc_big_alloc(ptls, osize, NULL); #endif maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.poolalloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + osize); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.poolalloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.poolalloc) + 1); // first try to use the freelist jl_taggedvalue_t *v = p->freelist; if (v != NULL) { @@ -971,8 +971,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ // instead of adding it to the thread that originally allocated the page, so we can avoid // an atomic-fetch-add here. 
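 // (The usable payload of a page is GC_PAGE_SZ - GC_PAGE_OFFSET bytes, and each
 // of the nfree free slots spans osize bytes, which is where the live-byte
 // delta below comes from.)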
size_t delta = (GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.pool_live_bytes) + delta); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.pool_live_bytes) + delta); jl_atomic_fetch_add_relaxed((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize); } @@ -1228,7 +1228,7 @@ static void gc_sweep_pool(void) } continue; } - jl_atomic_store_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes, 0); + jl_atomic_store_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes, 0); for (int i = 0; i < JL_GC_N_POOLS; i++) { jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; @@ -2834,7 +2834,7 @@ JL_DLLEXPORT int64_t jl_gc_pool_live_bytes(void) for (int i = 0; i < n_threads; i++) { jl_ptls_t ptls2 = all_tls_states[i]; if (ptls2 != NULL) { - pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls.gc_num.pool_live_bytes); + pool_live_bytes += jl_atomic_load_relaxed(&ptls2->gc_tls_common.gc_num.pool_live_bytes); } } return pool_live_bytes; @@ -3183,11 +3183,12 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { // GC threads should never exit assert(!gc_is_collector_thread(t_i)); + jl_thread_heap_common_t *common_heap = &ptls2->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls2->gc_tls.heap; - if (heap->weak_refs.len == 0) - small_arraylist_free(&heap->weak_refs); - if (heap->live_tasks.len == 0) - small_arraylist_free(&heap->live_tasks); + if (common_heap->weak_refs.len == 0) + small_arraylist_free(&common_heap->weak_refs); + if (common_heap->live_tasks.len == 0) + small_arraylist_free(&common_heap->live_tasks); if (heap->remset.len == 0) arraylist_free(&heap->remset); if (ptls2->finalizers.len == 0) @@ -3256,8 +3257,8 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + gc_num.interval; + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); return; @@ -3362,6 +3363,7 @@ void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) // Per-thread initialization void jl_init_thread_heap(jl_ptls_t ptls) { + jl_thread_heap_common_t *common_heap = &ptls->gc_tls_common.heap; jl_thread_heap_t *heap = &ptls->gc_tls.heap; jl_gc_pool_t *p = heap->norm_pools; for (int i = 0; i < JL_GC_N_POOLS; i++) { @@ -3369,12 +3371,12 @@ void jl_init_thread_heap(jl_ptls_t ptls) p[i].freelist = NULL; p[i].newpages = NULL; } - small_arraylist_new(&heap->weak_refs, 0); - small_arraylist_new(&heap->live_tasks, 0); + small_arraylist_new(&common_heap->weak_refs, 0); + small_arraylist_new(&common_heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) - small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; + small_arraylist_new(&common_heap->free_stacks[i], 0); + common_heap->mallocarrays = NULL; 
+ common_heap->mafreelist = NULL; heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag; @@ -3400,8 +3402,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); + memset(&ptls->gc_tls_common.gc_num, 0, sizeof(ptls->gc_tls_common.gc_num)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, -(int64_t)gc_num.interval); } void jl_free_thread_gc_state(jl_ptls_t ptls) @@ -3579,10 +3581,10 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz); } return data; @@ -3596,10 +3598,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) if (data != NULL && pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + nm*sz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, sz * nm); } return data; @@ -3624,10 +3626,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); if (!(sz < old)) - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + (sz - old)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.realloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.realloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.realloc) + 1); int64_t diff = sz - old; if (diff < 0) { @@ -3658,10 +3660,10 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) if (b == NULL) jl_throw(jl_memory_exception); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.malloc, - jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.malloc) + 1); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_tls_common.gc_num.malloc, + 
jl_atomic_load_relaxed(&ptls->gc_tls_common.gc_num.malloc) + 1); jl_batch_accum_heap_size(ptls, allocsz); #ifdef _OS_WINDOWS_ SetLastError(last_error); diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h new file mode 100644 index 0000000000000..28fbf2d0c448e --- /dev/null +++ b/src/gc-tls-common.h @@ -0,0 +1,52 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Meant to be included in "julia_threads.h" +#ifndef JL_GC_TLS_COMMON_H +#define JL_GC_TLS_COMMON_H + +#include "julia_atomics.h" + +// GC threading ------------------------------------------------------------------ + +#include "arraylist.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + // variable for tracking weak references + small_arraylist_t weak_refs; + // live tasks started on this thread + // that are holding onto a stack from the pool + small_arraylist_t live_tasks; + + // variables for tracking malloc'd arrays + struct _mallocmemory_t *mallocarrays; + struct _mallocmemory_t *mafreelist; + +#define JL_N_STACK_POOLS 16 + small_arraylist_t free_stacks[JL_N_STACK_POOLS]; +} jl_thread_heap_common_t; + +typedef struct { + _Atomic(int64_t) allocd; + _Atomic(int64_t) pool_live_bytes; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; +} jl_thread_gc_num_common_t; + +typedef struct { + jl_thread_heap_common_t heap; + jl_thread_gc_num_common_t gc_num; +} jl_gc_tls_states_common_t; + +#ifdef __cplusplus +} +#endif + +#endif // JL_GC_TLS_H diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..ecc815805a98b 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -21,16 +21,6 @@ typedef struct { } jl_gc_pool_t; typedef struct { - // variable for tracking weak references - small_arraylist_t weak_refs; - // live tasks started on this thread - // that are holding onto a stack from the pool - small_arraylist_t live_tasks; - - // variables for tracking malloc'd arrays - struct _mallocmemory_t *mallocarrays; - struct _mallocmemory_t *mafreelist; - // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects struct _bigval_t *young_generation_of_bigvals; @@ -42,22 +32,8 @@ typedef struct { // variables for allocating objects from pools #define JL_GC_N_MAX_POOLS 51 // conservative. 
must be kept in sync with `src/julia_internal.h` jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; - -#define JL_N_STACK_POOLS 16 - small_arraylist_t free_stacks[JL_N_STACK_POOLS]; } jl_thread_heap_t; -typedef struct { - _Atomic(int64_t) allocd; - _Atomic(int64_t) pool_live_bytes; - _Atomic(uint64_t) malloc; - _Atomic(uint64_t) realloc; - _Atomic(uint64_t) poolalloc; - _Atomic(uint64_t) bigalloc; - _Atomic(int64_t) free_acc; - _Atomic(uint64_t) alloc_acc; -} jl_thread_gc_num_t; - typedef struct { ws_queue_t chunk_queue; ws_queue_t ptr_queue; @@ -78,7 +54,6 @@ typedef struct { typedef struct { jl_thread_heap_t heap; jl_gc_page_stack_t page_metadata_allocd; - jl_thread_gc_num_t gc_num; jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..fcc28591658cb 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -5,6 +5,7 @@ #define JL_THREADS_H #include "gc-tls.h" +#include "gc-tls-common.h" #include "julia_atomics.h" #ifndef _OS_WINDOWS_ #include "pthread.h" @@ -155,6 +156,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + jl_gc_tls_states_common_t gc_tls_common; // common tls for both GCs volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/stackwalk.c b/src/stackwalk.c index a1de3a6d61a07..0988d7a833c94 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1309,7 +1309,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT if (ptls2 == NULL) { continue; } - small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + small_arraylist_t *live_tasks = &ptls2->gc_tls_common.heap.live_tasks; size_t n = mtarraylist_length(live_tasks); int t_state = JL_TASK_STATE_DONE; jl_task_t *t = ptls2->root_task; From 3271996a9eb45899e330a274420a53d45c6b4079 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 09:14:24 +0000 Subject: [PATCH 107/116] Typo --- src/gc-tls-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-tls-common.h b/src/gc-tls-common.h index 28fbf2d0c448e..ba36f5c1c238e 100644 --- a/src/gc-tls-common.h +++ b/src/gc-tls-common.h @@ -49,4 +49,4 @@ typedef struct { } #endif -#endif // JL_GC_TLS_H +#endif // JL_GC_TLS_COMMON_H From cd4f5a177f0c0c7d9e0fb59bf830f2d914c46727 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Tue, 8 Oct 2024 22:46:39 +0000 Subject: [PATCH 108/116] Adding gc-tls-common.h to Makefile as a public header --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a6b1f433b73ce..80bbdbcff67fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -103,7 +103,7 @@ ifeq ($(USE_SYSTEM_LIBUV),0) UV_HEADERS += uv.h UV_HEADERS += uv/*.h endif -PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) +PUBLIC_HEADERS := $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(addprefix $(SRCDIR)/,work-stealing-queue.h gc-interface.h gc-tls.h gc-tls-common.h julia.h julia_assert.h julia_threads.h julia_fasttls.h julia_locks.h julia_atomics.h jloptions.h) ifeq ($(OS),WINNT) PUBLIC_HEADERS += $(addprefix 
$(SRCDIR)/,win32_ucontext.h)
 endif

From f4eba6b1dbe4bf3d66ff9de33953a80d6afb0d07 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 10 Oct 2024 01:42:47 +0000
Subject: [PATCH 109/116] Adding jl_full_sweep_reasons since timing.jl depends on it

---
 src/gc-mmtk.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index aa010c73b27d2..0d9a4db1d4fbc 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -10,6 +10,11 @@ extern "C" {
 #endif

+// FIXME: Does it make sense for MMTk to implement something similar?
+// For now, just ignoring this.
+// Table recording number of full GCs due to each reason
+JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];
+
 // FIXME: Should the values below be shared between both GC's?
 // Note that MMTk uses a hard max heap limit, which is set by default
 // as 70% of the free available memory. The min heap is set as the

From c20ecb31d11a4ce7354d05ae5e49ffc91714901f Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 10 Oct 2024 02:06:50 +0000
Subject: [PATCH 110/116] Fixing issue with jl_full_sweep_reasons (missing constants)

---
 src/gc-mmtk.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index 0d9a4db1d4fbc..48992aeb43bd0 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -12,6 +12,14 @@ extern "C" {

 // FIXME: Does it make sense for MMTk to implement something similar?
 // For now, just ignoring this.
+
+// Must be kept in sync with `base/timing.jl`
+#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0)
+#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1)
+#define FULL_SWEEP_REASON_USER_MAX_EXCEEDED (2)
+#define FULL_SWEEP_REASON_LARGE_PROMOTION_RATE (3)
+#define FULL_SWEEP_NUM_REASONS (4)
+
 // Table recording number of full GCs due to each reason
 JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS];

From 7cc2fe10dd849f1e69f640e73f1cab2c62c6a6e2 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Thu, 10 Oct 2024 03:11:10 +0000
Subject: [PATCH 111/116] Removing redundant changes to upstream (wip; need to add back stuff for sticky)

---
 Makefile | 5 +----
 src/Makefile | 5 ++---
 src/builtins.c | 1 -
 src/datatype.c | 8 --------
 src/gc-common.c | 2 +-
 src/gc-stacks.c | 2 --
 src/gc-stock.c | 2 +-
 src/genericmemory.c | 1 -
 src/init.c | 7 ++-----
 src/jitlayers.h | 4 ----
 src/julia.h | 2 +-
 src/julia_internal.h | 2 +-
 src/julia_threads.h | 2 +-
 src/llvm-final-gc-lowering.cpp | 2 +-
 src/llvm-late-gc-lowering.cpp | 2 +-
 src/symbol.c | 4 ----
 src/threading.c | 3 +--
 17 files changed, 13 insertions(+), 41 deletions(-)

diff --git a/Makefile b/Makefile
index 732fcfcc77e7d..4fd8b878c5d1f 100644
--- a/Makefile
+++ b/Makefile
@@ -130,7 +130,7 @@ check-whitespace:
 ifneq ($(NO_GIT), 1)
	@# Append the directory containing the julia we just built to the end of `PATH`,
	@# to give us the best chance of being able to run this check.
- @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" $(JULIA_EXECUTABLE) $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) + @PATH="$(PATH):$(dir $(JULIA_EXECUTABLE))" julia $(call cygpath_w,$(JULIAHOME)/contrib/check-whitespace.jl) else $(warn "Skipping whitespace check because git is unavailable") endif @@ -648,9 +648,6 @@ testall: check-whitespace $(JULIA_BUILD_MODE) testall1: check-whitespace $(JULIA_BUILD_MODE) @env JULIA_CPU_THREADS=1 $(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/test all JULIA_BUILD_MODE=$(JULIA_BUILD_MODE) -testall3: check-whitespace $(JULIA_BUILD_MODE) - @env JULIA_CPU_THREADS=3 $(MAKE) $(QUIET_MAKE) -C $(BUILDROOT)/test all JULIA_BUILD_MODE=$(JULIA_BUILD_MODE) - test-%: check-whitespace $(JULIA_BUILD_MODE) .FORCE @([ $$(( $$(date +%s) - $$(date -r $(build_private_libdir)/sys.$(SHLIB_EXT) +%s) )) -le 100 ] && \ printf '\033[93m HINT The system image was recently rebuilt. Are you aware of the test-revise-* targets? See CONTRIBUTING.md. \033[0m\n') || true diff --git a/src/Makefile b/src/Makefile index b27eee8db8511..b2857e63d5881 100644 --- a/src/Makefile +++ b/src/Makefile @@ -18,7 +18,6 @@ FLAGS := \ -I$(SRCDIR)/flisp -I$(SRCDIR)/support \ -I$(LIBUV_INC) -I$(build_includedir) \ -I$(JULIAHOME)/deps/valgrind - FLAGS += -Wall -Wno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden -fno-common \ -Wno-comment -Wpointer-arith -Wundef ifeq ($(USEGCC),1) # GCC bug #25509 (void)__attribute__((warn_unused_result)) @@ -427,13 +426,13 @@ $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/jul @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(SHIPFLAGS) $(OBJS) $(MMTK_OBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(BOLT_LDFLAGS) $(JLIBLDFLAGS) $(RT_RELEASE_LIBS) $(call SONAME_FLAGS,libjulia-internal.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal.$(SHLIB_EXT) $@ - $(DSYMUTIL) $@ + $(DSYMUTIL) $@ $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_MINOR_SHLIB_EXT): $(BUILDDIR)/julia.expmap $(DOBJS) $(MMTK_DOBJS) $(BUILDDIR)/flisp/libflisp-debug.a $(BUILDDIR)/support/libsupport-debug.a $(LIBUV) @$(call PRINT_LINK, $(CXXLD) $(call IMPLIB_FLAGS,$@) $(JCXXFLAGS) $(JL_CXXFLAGS) $(CXXLDFLAGS) $(DEBUGFLAGS) $(DOBJS) $(MMTK_DOBJS) $(RPATH_LIB) -o $@ \ $(JLDFLAGS) $(JLIBLDFLAGS) $(RT_DEBUG_LIBS) $(call SONAME_FLAGS,libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT))) @$(INSTALL_NAME_CMD)libjulia-internal-debug.$(SHLIB_EXT) $@ - $(DSYMUTIL) $@ + $(DSYMUTIL) $@ ifneq ($(OS), WINNT) $(build_shlibdir)/libjulia-internal.$(JL_MAJOR_SHLIB_EXT) $(build_shlibdir)/libjulia-internal-debug.$(JL_MAJOR_SHLIB_EXT): $(build_shlibdir)/libjulia-internal%.$(JL_MAJOR_SHLIB_EXT): \ diff --git a/src/builtins.c b/src/builtins.c index d1ffadaf706cc..939aef4234ac9 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -22,7 +22,6 @@ #include #include "julia.h" #include "julia_internal.h" -#include "gc-interface.h" #include "builtin_proto.h" #include "intrinsics.h" #include "julia_assert.h" diff --git a/src/datatype.c b/src/datatype.c index 03afce0e97a25..3a2ebf2bb303a 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -290,10 +290,6 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz, if ((void*)ret == HT_NOTFOUND) { if (!should_malloc) { char *perm_mem = (char *)jl_gc_perm_alloc(flddesc_sz, 0, 4, 0); -#ifdef MMTK_GC - jl_ptls_t ptls = jl_current_task->ptls; - mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(perm_mem), flddesc_sz); -#endif assert(perm_mem); ret = (jl_datatype_layout_t *)perm_mem; memcpy(perm_mem, 
flddesc, flddesc_sz); @@ -977,10 +973,6 @@ JL_DLLEXPORT jl_datatype_t * jl_new_foreign_type(jl_sym_t *name, jl_datatype_layout_t *layout = (jl_datatype_layout_t *) jl_gc_perm_alloc(sizeof(jl_datatype_layout_t) + sizeof(jl_fielddescdyn_t), 0, 4, 0); -#ifdef MMTK_GC - jl_ptls_t ptls = jl_current_task->ptls; - mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(layout), sizeof(jl_datatype_layout_t) + sizeof(jl_fielddescdyn_t)); -#endif layout->size = large ? GC_MAX_SZCLASS+1 : 0; layout->nfields = 0; layout->alignment = sizeof(void *); diff --git a/src/gc-common.c b/src/gc-common.c index 29e9233205dd5..6ce455d3923ad 100644 --- a/src/gc-common.c +++ b/src/gc-common.c @@ -707,4 +707,4 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 6d96825dfd13d..a8fec938456a3 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -289,8 +289,6 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } } -extern int gc_first_tid; - JL_DLLEXPORT jl_array_t *jl_live_tasks(void) { size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); diff --git a/src/gc-stock.c b/src/gc-stock.c index 3c5cea47a6236..daebfc4e22ba9 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3916,4 +3916,4 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) } #endif -#endif // !MMTK_GC \ No newline at end of file +#endif // !MMTK_GC diff --git a/src/genericmemory.c b/src/genericmemory.c index 6851e9131e534..93d90ef99e8b0 100644 --- a/src/genericmemory.c +++ b/src/genericmemory.c @@ -54,7 +54,6 @@ jl_genericmemory_t *_new_genericmemory_(jl_value_t *mtype, size_t nel, int8_t is tot = sizeof(jl_genericmemory_t) + sizeof(void*); } m = (jl_genericmemory_t*)jl_gc_alloc(ct->ptls, tot, mtype); - if (pooled) { data = (char*)m + JL_SMALL_BYTE_ALIGNMENT; } diff --git a/src/init.c b/src/init.c index 1e5cd129cf264..413d4e8055e54 100644 --- a/src/init.c +++ b/src/init.c @@ -286,7 +286,8 @@ JL_DLLEXPORT void jl_atexit_hook(int exitcode) JL_NOTSAFEPOINT_ENTER JL_STDOUT = (uv_stream_t*) STDOUT_FILENO; JL_STDERR = (uv_stream_t*) STDERR_FILENO; - jl_gc_run_all_finalizers(ct); + if (ct) + jl_gc_run_all_finalizers(ct); uv_loop_t *loop = jl_global_event_loop(); if (loop != NULL) { @@ -826,7 +827,6 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) arraylist_push(&eytzinger_image_tree, (void*)1); // outside image jl_ptls_t ptls = jl_init_threadtls(0); - #pragma GCC diagnostic push #if defined(_COMPILER_GCC_) && __GNUC__ >= 12 #pragma GCC diagnostic ignored "-Wdangling-pointer" @@ -889,9 +889,6 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ jl_start_gc_threads(); uv_barrier_wait(&thread_init_done); -#ifdef MMTK_GC - mmtk_initialize_collection((void *)ptls); -#endif jl_gc_enable(1); if (jl_options.image_file && (!jl_generating_output() || jl_options.incremental) && jl_module_init_order) { diff --git a/src/jitlayers.h b/src/jitlayers.h index 6dc9c51cef98d..9ae99d3a2c9b2 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -326,10 +326,6 @@ class MaxAlignedAllocImpl Align MaxAlign = alignment(Size); assert(Alignment < MaxAlign); (void)Alignment; void* result = jl_gc_perm_alloc(Size, 0, MaxAlign.value(), offset); -#ifdef MMTK_GC - jl_ptls_t ptls = jl_current_task->ptls; - mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(result), Size); -#endif return result; } diff --git a/src/julia.h b/src/julia.h index 5954872dcafa6..ed3d9bf825658 100644 --- a/src/julia.h +++ b/src/julia.h 
@@ -2654,4 +2654,4 @@ typedef struct {
 }
 #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/src/julia_internal.h b/src/julia_internal.h
index ec77ebe93233e..6fd537ed6baf8 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1970,4 +1970,4 @@ JL_DLLIMPORT uint64_t jl_getUnwindInfo(uint64_t dwBase);
 #define JL_PROBE_RT_SLEEP_CHECK_UV_WAKE_ENABLED() (0)
 #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/src/julia_threads.h b/src/julia_threads.h
index e118295ef4056..3b804823d796b 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -313,4 +313,4 @@ JL_DLLEXPORT int jl_setaffinity(int16_t tid, char *mask, int cpumasksize);
 }
 #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp
index 816181b14e1dd..76dcd944890ab 100644
--- a/src/llvm-final-gc-lowering.cpp
+++ b/src/llvm-final-gc-lowering.cpp
@@ -225,4 +225,4 @@ PreservedAnalyses FinalLowerGCPass::run(Function &F, FunctionAnalysisManager &AM
         return PreservedAnalyses::allInSet<CFGAnalyses>();
     }
     return PreservedAnalyses::all();
-}
\ No newline at end of file
+}
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index bc8cec2eb42cd..4b7dc0ec855a7 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2629,4 +2629,4 @@ PreservedAnalyses LateLowerGCPass::run(Function &F, FunctionAnalysisManager &AM)
         }
     }
     return PreservedAnalyses::all();
-}
\ No newline at end of file
+}
diff --git a/src/symbol.c b/src/symbol.c
index 079c044752e1b..ef2c11e0842e8 100644
--- a/src/symbol.c
+++ b/src/symbol.c
@@ -40,10 +40,6 @@ static jl_sym_t *mk_symbol(const char *str, size_t len) JL_NOTSAFEPOINT
     sym = (jl_sym_t*)jl_valueof(tag);
     // set to old marked so that we won't look at it in the GC or write barrier.
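     // (GC_OLD_MARKED is the terminal mark state: neither collector nor the
     // write barrier will ever revisit the symbol, which is safe because
     // symbols are permanently allocated and never freed.)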
jl_set_typetagof(sym, jl_symbol_tag, GC_OLD_MARKED); -#ifdef MMTK_GC - jl_ptls_t ptls = jl_current_task->ptls; - mmtk_immortal_post_alloc_fast(&ptls->mmtk_mutator, jl_valueof(tag), nb); -#endif jl_atomic_store_relaxed(&sym->left, NULL); jl_atomic_store_relaxed(&sym->right, NULL); sym->hash = hash_symbol(str, len); diff --git a/src/threading.c b/src/threading.c index ac2a75e23a1c3..4e4186f5f070d 100644 --- a/src/threading.c +++ b/src/threading.c @@ -340,9 +340,8 @@ jl_ptls_t jl_init_threadtls(int16_t tid) #endif ptls->system_id = uv_thread_self(); ptls->rngseed = jl_rand(); - if (tid == 0) { + if (tid == 0) ptls->disable_gc = 1; - } #ifdef _OS_WINDOWS_ if (tid == 0) { if (!DuplicateHandle(GetCurrentProcess(), GetCurrentThread(), From 3f7fd3194878af7e80a01032a1fd848a76d882a0 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 10 Oct 2024 03:12:58 +0000 Subject: [PATCH 112/116] Typo --- src/gc-stock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-stock.h b/src/gc-stock.h index 91da2cd32f28a..686753fd37349 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -729,4 +729,4 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect #endif -#endif // !MMTK_GC \ No newline at end of file +#endif // !MMTK_GC From 7ff37de41278b5f1290f21714f876f356266698a Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 10 Oct 2024 03:22:31 +0000 Subject: [PATCH 113/116] Cleanup --- src/Makefile | 9 --------- src/jitlayers.h | 3 +-- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/Makefile b/src/Makefile index b2857e63d5881..a88f28b98de14 100644 --- a/src/Makefile +++ b/src/Makefile @@ -270,15 +270,6 @@ $(BUILDDIR)/%.o : $(SRCDIR)/%.d $(BUILDDIR)/%.dbg.obj : $(SRCDIR)/%.d @$(call PRINT_DTRACE, $(DTRACE) -G -s $< -o $@) -ifeq ($(WITH_MMTK), 1) -$(MMTK_JULIA_INC)/%.o: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) - @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(SHIPFLAGS) $(DISABLE_ASSERTIONS) -c $< -o $@) -$(MMTK_JULIA_INC)/%.dbg.obj: $(MMTK_JULIA_INC)/%.c $(HEADERS) | $(MMTK_JULIA_INC) - @$(call PRINT_CC, $(CC) $(JCPPFLAGS) $(JCFLAGS) $(DEBUGFLAGS) -c $< -o $@) -$(MMTK_LIB_DST): $(MMTK_LIB_SRC) - @$(call PRINT_MMTK, cp $< $@) -endif - # public header rules $(eval $(call dir_target,$(build_includedir)/julia)) define public_header_target diff --git a/src/jitlayers.h b/src/jitlayers.h index 9ae99d3a2c9b2..3353a4093bd27 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -325,8 +325,7 @@ class MaxAlignedAllocImpl LLVM_ATTRIBUTE_RETURNS_NONNULL void *Allocate(size_t Size, Align Alignment) { Align MaxAlign = alignment(Size); assert(Alignment < MaxAlign); (void)Alignment; - void* result = jl_gc_perm_alloc(Size, 0, MaxAlign.value(), offset); - return result; + return jl_gc_perm_alloc(Size, 0, MaxAlign.value(), offset); } inline LLVM_ATTRIBUTE_RETURNS_NONNULL From 2e1d5da8d102344b9fabaeb97b8cabaf925d18f5 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 10 Oct 2024 04:49:47 +0000 Subject: [PATCH 114/116] Cleanup; Adding pieces to make building immix possible again --- src/gc-interface.h | 2 ++ src/gc-mmtk.c | 33 +++++++++++++++++++++++++++++++++ src/gc-stock.c | 22 ++++++++++++++++++++++ src/genericmemory.c | 2 +- src/julia.h | 9 +++++++++ src/mmtk-gc.c | 39 --------------------------------------- src/staticdata.c | 2 -- src/threading.c | 1 - 8 files changed, 67 insertions(+), 43 deletions(-) diff --git a/src/gc-interface.h b/src/gc-interface.h index b1f3ab9d6908d..6e36f5670c7f3 100644 --- a/src/gc-interface.h +++ 
b/src/gc-interface.h
@@ -96,6 +96,8 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem);
 JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection);
 // Returns whether the thread with `tid` is a collector thread
 JL_DLLEXPORT int gc_is_collector_thread(int tid) JL_NOTSAFEPOINT;
+// Pinning objects. Returns whether the object has been pinned by this call.
+JL_DLLEXPORT unsigned char jl_gc_pin_object(void* obj);

 // ========================================================================= //
 // Metrics
diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index 48992aeb43bd0..b36524e8f56fd 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -634,6 +634,33 @@ int jl_n_sweepthreads;
 // `tid` of first GC thread
 int gc_first_tid;

+// Write barriers
+
+// No inline write barrier -- only used for debugging
+JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT
+{
+    jl_gc_wb_back(parent);
+}
+
+JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOTSAFEPOINT
+{
+    jl_gc_wb(parent, ptr);
+}
+
+JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT
+{
+    jl_task_t *ct = jl_current_task;
+    jl_ptls_t ptls = ct->ptls;
+    mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, (const void*) 0);
+}
+
+JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT
+{
+    jl_task_t *ct = jl_current_task;
+    jl_ptls_t ptls = ct->ptls;
+    mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr);
+}
+
 JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT
 {
     mmtk_unreachable();
@@ -722,6 +749,12 @@ JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p)
     return NULL;
 }

+extern unsigned char mmtk_pin_object(void* obj);
+
+JL_DLLEXPORT unsigned char jl_gc_pin_object(void* obj) {
+    return mmtk_pin_object(obj);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gc-stock.c b/src/gc-stock.c
index daebfc4e22ba9..d193254834a56 100644
--- a/src/gc-stock.c
+++ b/src/gc-stock.c
@@ -3912,6 +3912,28 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
     // Do nothing
 }

+JL_DLLEXPORT unsigned char jl_gc_pin_object(void* obj) {
+    return 0;
+}
+
+// Added for MMTk integration
+
+JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT
+{
+}
+
+JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOTSAFEPOINT
+{
+}
+
+JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT
+{
+}
+
+JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT
+{
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/genericmemory.c b/src/genericmemory.c
index 93d90ef99e8b0..276a4fdd17d8a 100644
--- a/src/genericmemory.c
+++ b/src/genericmemory.c
@@ -270,7 +270,7 @@ JL_DLLEXPORT void jl_genericmemory_copyto(jl_genericmemory_t *dest, char* destdata
     _Atomic(void*) * dest_p = (_Atomic(void*)*)destdata;
     _Atomic(void*) * src_p = (_Atomic(void*)*)srcdata;
     jl_value_t *owner = jl_genericmemory_owner(dest);
-    mmtk_gc_wb(owner, NULL);
+    jl_gc_wb(owner, NULL); // FIXME: needs to be added here since the check below doesn't apply to MMTk
     if (__unlikely(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED)) {
         jl_value_t *src_owner = jl_genericmemory_owner(src);
         ssize_t done = 0;
diff --git a/src/julia.h b/src/julia.h
index ed3d9bf825658..816d7b5d75b16 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -79,6 +79,15 @@ typedef struct _jl_tls_states_t *jl_ptls_t;
 extern "C" {
 #endif

+// object pinning ------------------------------------------------------------
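+// (Note on the pinning macros defined below: with the stock GC,
+// jl_gc_pin_object is a stub that returns 0, so pinning is a no-op; the MMTk
+// build forwards it to mmtk_pin_object, as shown in the hunks above. Intended
+// usage, sketched with a hypothetical htable_t `table`:
+//     PTRHASH_PIN(key)                  // keep key's address stable for hashing
+//     ptrhash_put(&table, key, value);
+// )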
+ +// FIXME: Pinning objects that get hashed in the ptrhash table +// until we implement address space hashing. +#define PTRHASH_PIN(key) jl_gc_pin_object(key); + +// Called when pinning objects that would cause an error if moved +#define PTR_PIN(key) jl_gc_pin_object(key); + // core data types ------------------------------------------------------------ // the common fields are hidden before the pointer, but the following macro is diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c index 5a104c4856c54..284e72a502b3a 100644 --- a/src/mmtk-gc.c +++ b/src/mmtk-gc.c @@ -295,15 +295,6 @@ void jl_init_thread_heap(jl_ptls_t ptls) mmtk_post_bind_mutator(&ptls->mmtk_mutator, mmtk_mutator); } -void jl_free_thread_gc_state(jl_ptls_t ptls) -{ -} - -void jl_deinit_thread_heap(jl_ptls_t ptls) -{ - mmtk_destroy_mutator(&ptls->mmtk_mutator); -} - extern jl_mutex_t finalizers_lock; extern arraylist_t to_finalize; extern arraylist_t finalizer_list_marked; @@ -529,31 +520,6 @@ JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array mmtk_memory_region_copy(&ptls->mmtk_mutator, jl_array_owner(src), src_p, jl_array_owner(dest), dest_p, n); } -// No inline write barrier -- only used for debugging -JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT -{ - jl_gc_wb_back(parent); -} - -JL_DLLEXPORT void jl_gc_wb2_noinline(const void *parent, const void *ptr) JL_NOTSAFEPOINT -{ - jl_gc_wb(parent, ptr); -} - -JL_DLLEXPORT void jl_gc_wb1_slow(const void *parent) JL_NOTSAFEPOINT -{ - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(&ptls->mmtk_mutator, parent, (const void*) 0); -} - -JL_DLLEXPORT void jl_gc_wb2_slow(const void *parent, const void* ptr) JL_NOTSAFEPOINT -{ - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - mmtk_object_reference_write_slow(&ptls->mmtk_mutator, parent, ptr); -} - void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset) { jl_ptls_t ptls = jl_current_task->ptls; @@ -572,11 +538,6 @@ void jl_gc_notify_image_load(const char* img_data, size_t len) mmtk_set_vm_space((void*)img_data, len); } -void jl_gc_notify_image_alloc(char* img_data, size_t len) -{ - mmtk_immortal_region_post_alloc((void*)img_data, len); -} - #ifdef __cplusplus } #endif diff --git a/src/staticdata.c b/src/staticdata.c index af24a84f39854..6f4bc61521c1a 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -3915,7 +3915,6 @@ static jl_value_t *jl_restore_package_image_from_stream(void* pkgimage_handle, i ios_seek(f, datastartpos); if (needs_permalloc) { sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0); - jl_gc_notify_image_alloc(sysimg, len); } else sysimg = &f->buf[f->bpos]; @@ -4025,7 +4024,6 @@ JL_DLLEXPORT void jl_restore_system_image(const char *fname) ios_seek_end(&f); size_t len = ios_pos(&f); char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0); - jl_gc_notify_image_alloc(sysimg, len); ios_seek(&f, 0); if (ios_readall(&f, sysimg, len) != len) jl_errorf("Error reading system image file."); diff --git a/src/threading.c b/src/threading.c index 4e4186f5f070d..9e6974da3b2ec 100644 --- a/src/threading.c +++ b/src/threading.c @@ -527,7 +527,6 @@ static void jl_delete_thread(void *value) JL_NOTSAFEPOINT_ENTER #else pthread_mutex_unlock(&in_signal_lock); #endif - jl_deinit_thread_heap(ptls); free(ptls->bt_data); small_arraylist_free(&ptls->locks); ptls->previous_exception = NULL; From 2ca9fb0d73fc154ab612ea9aae5b4e958066c930 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 10 Oct 2024 05:47:03 
+0000 Subject: [PATCH 115/116] Trying to add sticky back again (wip) --- src/Makefile | 2 +- src/gc-mmtk.c | 17 +- src/julia.h | 72 ++++- src/llvm-final-gc-lowering.cpp | 39 +++ src/llvm-gc-interface-passes.h | 13 + src/llvm-late-gc-lowering.cpp | 59 +++- src/mmtk-gc.c | 545 --------------------------------- 7 files changed, 193 insertions(+), 554 deletions(-) delete mode 100644 src/mmtk-gc.c diff --git a/src/Makefile b/src/Makefile index a88f28b98de14..308678662c879 100644 --- a/src/Makefile +++ b/src/Makefile @@ -341,7 +341,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h -$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/mmtk-gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h +$(BUILDDIR)/gc-mmtk.o $(BUILDDIR)/gc-mmtk.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c index b36524e8f56fd..1f20ba875b150 100644 --- a/src/gc-mmtk.c +++ b/src/gc-mmtk.c @@ -382,9 +382,18 @@ inline void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t size, } inline void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) { - // FIXME: Similarly, for now, we do nothing - // but when supporting moving, this is where we set the valid object (VO) bit - // and log (old gen) bit + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) obj; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + while(1) { + uint8_t old_val = *meta_addr; + uint8_t new_val = old_val | (1 << shift); + if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) { + break; + } + } + } } // allocation wrappers that track allocation and let collection run @@ -634,7 +643,7 @@ int jl_n_sweepthreads; // `tid` of first GC thread int gc_first_tid; -// Write barriers +// TODO: Move write barriers from julia.h and add them here // No inline write barrier -- only used for debugging JL_DLLEXPORT void jl_gc_wb1_noinline(const void *parent) JL_NOTSAFEPOINT diff --git a/src/julia.h b/src/julia.h index 816d7b5d75b16..651f313021f95 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1130,7 +1130,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, struct _jl_task_t *owner) JL_N JL_DLLEXPORT void jl_free_stack(void *stkbuf, size_t bufsz); // GC write barriers - +#ifndef MMTK_GC STATIC_INLINE void jl_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT { // parent and ptr isa jl_value_t* @@ -1160,6 +1160,24 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_ if (ly->npointers) jl_gc_queue_multiroot((jl_value_t*)parent, ptr, dt); } +#else // MMTK_GC +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT; + +STATIC_INLINE void jl_gc_wb(const void 
*parent, const void *ptr) JL_NOTSAFEPOINT +{ + mmtk_gc_wb(parent, ptr); +} + +STATIC_INLINE void jl_gc_wb_back(const void *ptr) JL_NOTSAFEPOINT // ptr isa jl_value_t* +{ + mmtk_gc_wb(ptr, (void*)0); +} + +STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_NOTSAFEPOINT +{ + mmtk_gc_wb(parent, (void*)0); +} +#endif // MMTK_GC JL_DLLEXPORT void jl_gc_safepoint(void); JL_DLLEXPORT int jl_safepoint_suspend_thread(int tid, int waitstate); @@ -2659,6 +2677,58 @@ typedef struct { int emit_metadata; } jl_emission_params_t; +#ifdef MMTK_GC + +extern void mmtk_object_reference_write_post(void* mutator, const void* parent, const void* ptr); +extern void mmtk_object_reference_write_slow(void* mutator, const void* parent, const void* ptr); + +// These need to be constants. + +#define MMTK_OBJECT_BARRIER (1) +// Stickyimmix needs write barrier. Immix does not need write barrier. +#ifdef MMTK_PLAN_IMMIX +#define MMTK_NEEDS_WRITE_BARRIER (0) +#endif +#ifdef MMTK_PLAN_STICKYIMMIX +#define MMTK_NEEDS_WRITE_BARRIER (1) +#endif + +#define MMTK_DEFAULT_IMMIX_ALLOCATOR (0) +#define MMTK_IMMORTAL_BUMP_ALLOCATOR (0) + +extern const void* MMTK_SIDE_LOG_BIT_BASE_ADDRESS; + +// Directly call into MMTk for write barrier (debugging only) +STATIC_INLINE void mmtk_gc_wb_full(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_post(&ptls->gc_tls.mmtk_mutator, parent, ptr); +} + +// Inlined fastpath +STATIC_INLINE void mmtk_gc_wb_fast(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) { + intptr_t addr = (intptr_t) (void*) parent; + uint8_t* meta_addr = (uint8_t*) (MMTK_SIDE_LOG_BIT_BASE_ADDRESS) + (addr >> 6); + intptr_t shift = (addr >> 3) & 0b111; + uint8_t byte_val = *meta_addr; + if (((byte_val >> shift) & 1) == 1) { + jl_task_t *ct = jl_current_task; + jl_ptls_t ptls = ct->ptls; + mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, parent, ptr); + } + } +} + +STATIC_INLINE void mmtk_gc_wb(const void *parent, const void *ptr) JL_NOTSAFEPOINT +{ + mmtk_gc_wb_fast(parent, ptr); +} + +#endif + #ifdef __cplusplus } #endif diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index 76dcd944890ab..b06a084651231 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -117,6 +117,32 @@ void FinalLowerGC::lowerSafepoint(CallInst *target, Function &F) target->eraseFromParent(); } +#ifdef MMTK_GC +void FinalLowerGC::lowerWriteBarrier1(CallInst *target, Function &F) +{ + assert(target->arg_size() == 1); + target->setCalledFunction(writeBarrier1Func); +} + +void FinalLowerGC::lowerWriteBarrier2(CallInst *target, Function &F) +{ + assert(target->arg_size() == 2); + target->setCalledFunction(writeBarrier2Func); +} + +void FinalLowerGC::lowerWriteBarrier1Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 1); + target->setCalledFunction(writeBarrier1SlowFunc); +} + +void FinalLowerGC::lowerWriteBarrier2Slow(CallInst *target, Function &F) +{ + assert(target->arg_size() == 2); + target->setCalledFunction(writeBarrier2SlowFunc); +} +#endif + void FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) { ++GCAllocBytesCount; @@ -181,6 +207,12 @@ bool FinalLowerGC::runOnFunction(Function &F) smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); allocTypedFunc = getOrDeclare(jl_well_known::GCAllocTyped); +#ifdef 
MMTK_GC
+    writeBarrier1Func = getOrDeclare(jl_well_known::GCWriteBarrier1);
+    writeBarrier2Func = getOrDeclare(jl_well_known::GCWriteBarrier2);
+    writeBarrier1SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier1Slow);
+    writeBarrier2SlowFunc = getOrDeclare(jl_well_known::GCWriteBarrier2Slow);
+#endif
     T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());

     // Lower all calls to supported intrinsics.
@@ -209,6 +241,13 @@ bool FinalLowerGC::runOnFunction(Function &F)
             LOWER_INTRINSIC(queueGCRoot, lowerQueueGCRoot);
             LOWER_INTRINSIC(safepoint, lowerSafepoint);
+#ifdef MMTK_GC
+            LOWER_INTRINSIC(writeBarrier1, lowerWriteBarrier1);
+            LOWER_INTRINSIC(writeBarrier2, lowerWriteBarrier2);
+            LOWER_INTRINSIC(writeBarrier1Slow, lowerWriteBarrier1Slow);
+            LOWER_INTRINSIC(writeBarrier2Slow, lowerWriteBarrier2Slow);
+#endif
+
             #undef LOWER_INTRINSIC
         }
     }
diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h
index ed6b94dcdc3fc..7ddfc1f1c10ef 100644
--- a/src/llvm-gc-interface-passes.h
+++ b/src/llvm-gc-interface-passes.h
@@ -389,6 +389,12 @@ struct FinalLowerGC: private JuliaPassContext {
     Function *smallAllocFunc;
     Function *bigAllocFunc;
     Function *allocTypedFunc;
+#ifdef MMTK_GC
+    Function *writeBarrier1Func;
+    Function *writeBarrier2Func;
+    Function *writeBarrier1SlowFunc;
+    Function *writeBarrier2SlowFunc;
+#endif
     Instruction *pgcstack;
     Type *T_size;

@@ -412,6 +418,13 @@ struct FinalLowerGC: private JuliaPassContext {

     // Lowers a `julia.safepoint` intrinsic.
     void lowerSafepoint(CallInst *target, Function &F);
+
+#ifdef MMTK_GC
+    void lowerWriteBarrier1(CallInst *target, Function &F);
+    void lowerWriteBarrier2(CallInst *target, Function &F);
+    void lowerWriteBarrier1Slow(CallInst *target, Function &F);
+    void lowerWriteBarrier2Slow(CallInst *target, Function &F);
+#endif
 };

 #endif // LLVM_GC_PASSES_H
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 4b7dc0ec855a7..3201ae64cf984 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -1944,14 +1944,15 @@ void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVe
         if (CFGModified) {
             *CFGModified = true;
         }
+
+        IRBuilder<> builder(CI);
+        builder.SetCurrentDebugLocation(CI->getDebugLoc());
+#ifndef MMTK_GC
         auto DebugInfoMeta = F.getParent()->getModuleFlag("julia.debug_level");
         int debug_info = 1;
         if (DebugInfoMeta != nullptr) {
             debug_info = cast<ConstantInt>(cast<ConstantAsMetadata>(DebugInfoMeta)->getValue())->getZExtValue();
         }
-
-        IRBuilder<> builder(CI);
-        builder.SetCurrentDebugLocation(CI->getDebugLoc());
         auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED);
         setName(parBits, "parent_bits", debug_info);
         auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED));
@@ -1981,6 +1982,58 @@ void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVe
         else {
             assert(false);
         }
+#else
+        // FIXME: Currently we call write barrier with the src object (parent).
+        // This works fine for the object barrier of generational plans (such as stickyimmix), which does not use the target object at all.
+        // But for other MMTk plans, we need to be careful.
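+        // The IR emitted below is intended to be the lowered form of the C
+        // fast path mmtk_gc_wb_fast in julia.h: load the side log-bit byte at
+        //     MMTK_SIDE_LOG_BIT_BASE_ADDRESS + (addr >> 6)
+        // test bit ((addr >> 3) & 7) of it, and branch to the writeBarrier1Slow
+        // intrinsic only while that bit is still set (the "unlogged" case).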
+        const bool INLINE_WRITE_BARRIER = true;
+        if (CI->getCalledOperand() == write_barrier_func) {
+            if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
+                if (INLINE_WRITE_BARRIER) {
+                    auto i8_ty = Type::getInt8Ty(F.getContext());
+                    auto intptr_ty = T_size;
+
+                    // intptr_t addr = (intptr_t) (void*) src;
+                    // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
+                    intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
+                    auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
+                    auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));
+
+                    auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
+                    auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
+                    auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);
+
+                    // intptr_t shift = (addr >> 3) & 0b111;
+                    auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
+                    auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);
+
+                    // uint8_t byte_val = *meta_addr;
+                    auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());
+
+                    // if (((byte_val >> shift) & 1) == 1) {
+                    auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
+                    auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
+                    auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));
+
+                    // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
+                    MDBuilder MDB(F.getContext());
+                    SmallVector<uint32_t, 2> Weights{1, 9};
+                    if (!S->DT) {
+                        S->DT = &GetDT();
+                    }
+                    DomTreeUpdater dtu = DomTreeUpdater(S->DT, llvm::DomTreeUpdater::UpdateStrategy::Lazy);
+                    auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights), &dtu);
+                    builder.SetInsertPoint(mayTriggerSlowpath);
+                    builder.CreateCall(getOrDeclare(jl_intrinsics::writeBarrier1Slow), { parent });
+                } else {
+                    Function *wb_func = getOrDeclare(jl_intrinsics::writeBarrier1);
+                    builder.CreateCall(wb_func, { parent });
+                }
+            }
+        } else {
+            assert(false);
+        }
+#endif
         CI->eraseFromParent();
     }
 }
diff --git a/src/mmtk-gc.c b/src/mmtk-gc.c
deleted file mode 100644
index 284e72a502b3a..0000000000000
--- a/src/mmtk-gc.c
+++ /dev/null
@@ -1,545 +0,0 @@
-// This file is a part of Julia.
License is MIT: https://julialang.org/license - -#ifdef MMTK_GC - -#include "gc.h" -#include "mmtk_julia.h" -#include "julia_gcext.h" - -// callbacks -// --- - -typedef void (*jl_gc_cb_func_t)(void); - -JL_DLLEXPORT void jl_gc_set_cb_root_scanner(jl_gc_cb_root_scanner_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_task_scanner(jl_gc_cb_task_scanner_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_pre_gc(jl_gc_cb_pre_gc_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_post_gc(jl_gc_cb_post_gc_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_notify_external_alloc(jl_gc_cb_notify_external_alloc_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_free_t cb, int enable) -{ -} -JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable) -{ -} - -// mutex for page profile -uv_mutex_t page_profile_lock; - -JL_DLLEXPORT void jl_gc_take_page_profile(ios_t *stream) -{ - uv_mutex_lock(&page_profile_lock); - const char *str = "Page profiler in unsupported in MMTk."; - ios_write(stream, str, strlen(str)); - uv_mutex_unlock(&page_profile_lock); -} - -JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; - -STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT -{ - // FIXME: MMTk would have to provide its own stats -} - -#define MMTK_GC_PAGE_SZ (1 << 12) // MMTk's page size is defined in mmtk-core constants - -JL_DLLEXPORT uint64_t jl_get_pg_size(void) -{ - return MMTK_GC_PAGE_SZ; -} - -inline void maybe_collect(jl_ptls_t ptls) -{ - // Just do a safe point for general maybe_collect - jl_gc_safepoint_(ptls); -} - -// This is only used for malloc. We need to know if we need to do GC. However, keeping checking with MMTk (mmtk_gc_poll), -// is expensive. So we only check for every few allocations. -static inline void malloc_maybe_collect(jl_ptls_t ptls, size_t sz) -{ - // We do not need to carefully maintain malloc_sz_since_last_poll. We just need to - // avoid using mmtk_gc_poll too frequently, and try to be precise on our heap usage - // as much as we can. - if (ptls->malloc_sz_since_last_poll > 4096) { - jl_atomic_store_relaxed(&ptls->malloc_sz_since_last_poll, 0); - mmtk_gc_poll(ptls); - } else { - jl_atomic_fetch_add_relaxed(&ptls->malloc_sz_since_last_poll, sz); - jl_gc_safepoint_(ptls); - } -} - -// allocation -int jl_gc_classify_pools(size_t sz, int *osize) -{ - if (sz > GC_MAX_SZCLASS) - return -1; // call big alloc function - size_t allocsz = sz + sizeof(jl_taggedvalue_t); - *osize = LLT_ALIGN(allocsz, 16); - return 0; // use MMTk's fastpath logic -} - -// malloc wrappers, aligned allocation -// We currently just duplicate what Julia GC does. We will in the future replace the malloc calls with MMTK's malloc. - -#if defined(_OS_WINDOWS_) -inline void *jl_malloc_aligned(size_t sz, size_t align) -{ - return _aligned_malloc(sz ? sz : 1, align); -} -inline void *jl_realloc_aligned(void *p, size_t sz, size_t oldsz, - size_t align) -{ - (void)oldsz; - return _aligned_realloc(p, sz ? 
sz : 1, align); -} -inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT -{ - _aligned_free(p); -} -#else -inline void *jl_malloc_aligned(size_t sz, size_t align) -{ -#if defined(_P64) || defined(__APPLE__) - if (align <= 16) - return malloc(sz); -#endif - void *ptr; - if (posix_memalign(&ptr, align, sz)) - return NULL; - return ptr; -} -inline void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, - size_t align) -{ -#if defined(_P64) || defined(__APPLE__) - if (align <= 16) - return realloc(d, sz); -#endif - void *b = jl_malloc_aligned(sz, align); - if (b != NULL) { - memcpy(b, d, oldsz > sz ? sz : oldsz); - free(d); - } - return b; -} -inline void jl_free_aligned(void *p) JL_NOTSAFEPOINT -{ - free(p); -} -#endif - -// weak references -// --- -JL_DLLEXPORT jl_weakref_t *jl_gc_new_weakref_th(jl_ptls_t ptls, jl_value_t *value) -{ - jl_weakref_t *wr = (jl_weakref_t*)jl_gc_alloc(ptls, sizeof(void*), jl_weakref_type); - wr->value = value; // NOTE: wb not needed here - mmtk_add_weak_candidate(wr); - return wr; -} - - -// big values -// --- - -// Size includes the tag and the tag is not cleared!! -inline jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) -{ - // TODO: assertion needed here? - assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); - // TODO: drop this okay? - // maybe_collect(ptls); - - jl_value_t *v = jl_mmtk_gc_alloc_big(ptls, sz); - // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_big; enable - // here when that's edited? - /* - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); - */ - // TODO: move to jl_mmtk_gc_alloc_big if needed. -/* -#ifdef MEMDEBUG - memset(v, 0xee, allocsz); -#endif -*/ - // TODO: need to set this? have to move to jl_mmtk_gc_alloc_big then. - // v->age = 0; - // TODO: dropping this; confirm okay? `sweep_big` no longer needed? - // gc_big_object_link(v, &ptls->heap.big_objects); - return v; -} - -// Size includes the tag and the tag is not cleared!! -inline jl_value_t *jl_gc_pool_alloc_inner(jl_ptls_t ptls, int pool_offset, int osize) -{ - assert(jl_atomic_load_relaxed(&ptls->gc_state) == 0); -#ifdef MEMDEBUG - return jl_gc_big_alloc(ptls, osize); -#endif - // TODO: drop this okay? - // maybe_collect(ptls); - - jl_value_t *v = jl_mmtk_gc_alloc_default(ptls, osize, 16, NULL); - // TODO: this is done (without atomic operations) in jl_mmtk_gc_alloc_default; enable - // here when that's edited? - /* - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + osize); - jl_atomic_store_relaxed(&ptls->gc_num.poolalloc, - jl_atomic_load_relaxed(&ptls->gc_num.poolalloc) + 1); - */ - return v; -} - -// roots -// --- - -JL_DLLEXPORT void jl_gc_queue_root(const jl_value_t *ptr) -{ - mmtk_unreachable(); -} - -// TODO: exported, but not MMTk-specific? 
-JL_DLLEXPORT void jl_gc_queue_multiroot(const jl_value_t *root, const void *stored, jl_datatype_t *dt) JL_NOTSAFEPOINT -{ - mmtk_unreachable(); -} - - -// marking -// --- - -JL_DLLEXPORT int jl_gc_mark_queue_obj(jl_ptls_t ptls, jl_value_t *obj) -{ - mmtk_unreachable(); - return 0; -} -JL_DLLEXPORT void jl_gc_mark_queue_objarray(jl_ptls_t ptls, jl_value_t *parent, - jl_value_t **objs, size_t nobjs) -{ - mmtk_unreachable(); -} - - -// GC control -// --- - -JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) -{ - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - if (jl_atomic_load_acquire(&jl_gc_disable_counter)) { - size_t localbytes = jl_atomic_load_relaxed(&ptls->gc_tls.gc_num.allocd) + gc_num.interval; - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - static_assert(sizeof(_Atomic(uint64_t)) == sizeof(gc_num.deferred_alloc), ""); - jl_atomic_fetch_add_relaxed((_Atomic(uint64_t)*)&gc_num.deferred_alloc, localbytes); - return; - } - mmtk_handle_user_collection_request(ptls, collection); -} - -// Per-thread initialization -// TODO: remove `norm_pools`, `weak_refs`, etc. from `heap`? -// TODO: remove `gc_cache`? -void jl_init_thread_heap(jl_ptls_t ptls) -{ - jl_thread_heap_t *heap = &ptls->gc_tls.heap; - jl_gc_pool_t *p = heap->norm_pools; - for (int i = 0; i < JL_GC_N_POOLS; i++) { - p[i].osize = jl_gc_sizeclasses[i]; - p[i].freelist = NULL; - p[i].newpages = NULL; - } - small_arraylist_new(&heap->weak_refs, 0); - small_arraylist_new(&heap->live_tasks, 0); - for (int i = 0; i < JL_N_STACK_POOLS; i++) - small_arraylist_new(&heap->free_stacks[i], 0); - heap->mallocarrays = NULL; - heap->mafreelist = NULL; - heap->big_objects = NULL; - arraylist_new(&heap->remset, 0); - arraylist_new(&ptls->finalizers, 0); - arraylist_new(&ptls->gc_tls.sweep_objs, 0); - - jl_gc_mark_cache_t *gc_cache = &ptls->gc_tls.gc_cache; - gc_cache->perm_scanned_bytes = 0; - gc_cache->scanned_bytes = 0; - gc_cache->nbig_obj = 0; - - memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); - jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); - - // Clear the malloc sz count - jl_atomic_store_relaxed(&ptls->malloc_sz_since_last_poll, 0); - - // Create mutator - MMTk_Mutator mmtk_mutator = mmtk_bind_mutator((void *)ptls, ptls->tid); - // Copy the mutator to the thread local storage - memcpy(&ptls->mmtk_mutator, mmtk_mutator, sizeof(MMTkMutatorContext)); - // Call post_bind to maintain a list of active mutators and to reclaim the old mutator (which is no longer needed) - mmtk_post_bind_mutator(&ptls->mmtk_mutator, mmtk_mutator); -} - -extern jl_mutex_t finalizers_lock; -extern arraylist_t to_finalize; -extern arraylist_t finalizer_list_marked; - -// System-wide initialization -// TODO: remove locks? remove anything else? 
-void jl_gc_init(void)
-{
- if (jl_options.heap_size_hint)
- jl_gc_set_max_memory(jl_options.heap_size_hint);
-
- JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock");
- JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
- uv_mutex_init(&gc_perm_lock);
-
- arraylist_new(&to_finalize, 0);
- arraylist_new(&finalizer_list_marked, 0);
-
- gc_num.interval = default_collect_interval;
- last_long_collect_interval = default_collect_interval;
- gc_num.allocd = 0;
- gc_num.max_pause = 0;
- gc_num.max_memory = 0;
-
-#ifdef _P64
- total_mem = uv_get_total_memory();
- uint64_t constrained_mem = uv_get_constrained_memory();
- if (constrained_mem > 0 && constrained_mem < total_mem)
- total_mem = constrained_mem;
-#endif
-
- // We allocate with abandon until we get close to the free memory on the machine.
- uint64_t free_mem = uv_get_available_memory();
- uint64_t high_water_mark = free_mem / 10 * 7; // 70% high water mark
-
- if (high_water_mark < max_total_memory)
- max_total_memory = high_water_mark;
-
- // MMTk-specific
- long long min_heap_size;
- long long max_heap_size;
- char* min_size_def = getenv("MMTK_MIN_HSIZE");
- char* min_size_gb = getenv("MMTK_MIN_HSIZE_G");
-
- char* max_size_def = getenv("MMTK_MAX_HSIZE");
- char* max_size_gb = getenv("MMTK_MAX_HSIZE_G");
-
- // default min heap is currently set to Julia's default_collect_interval
- if (min_size_def != NULL) {
- char *p;
- double min_size = strtod(min_size_def, &p);
- min_heap_size = (long) 1024 * 1024 * min_size;
- } else if (min_size_gb != NULL) {
- char *p;
- double min_size = strtod(min_size_gb, &p);
- min_heap_size = (long) 1024 * 1024 * 1024 * min_size;
- } else {
- min_heap_size = default_collect_interval;
- }
-
- // default max heap is currently set to 70% of the free memory in the system
- if (max_size_def != NULL) {
- char *p;
- double max_size = strtod(max_size_def, &p);
- max_heap_size = (long) 1024 * 1024 * max_size;
- } else if (max_size_gb != NULL) {
- char *p;
- double max_size = strtod(max_size_gb, &p);
- max_heap_size = (long) 1024 * 1024 * 1024 * max_size;
- } else {
- max_heap_size = uv_get_free_memory() * 70 / 100;
- }
-
- // Assert that the number of stock GC threads is 0; MMTk uses the number of threads in jl_options.ngcthreads
- assert(jl_n_gcthreads == 0);
-
- // Check that the julia_copy_stack Rust feature is enabled when COPY_STACKS is defined
- int copy_stacks;
-
-#ifdef COPY_STACKS
- copy_stacks = 1;
-#else
- copy_stacks = 0;
-#endif
-
- mmtk_julia_copy_stack_check(copy_stacks);
-
- // If only the max size is specified, initialize MMTk with a fixed-size heap
- // TODO: We just assume mark threads means GC threads, and ignore the number of concurrent sweep threads.
- // If the two values are the same, we can use either. Otherwise, we need to be careful.
- uintptr_t gcthreads = jl_options.nmarkthreads; - if (max_size_def != NULL || (max_size_gb != NULL && (min_size_def == NULL && min_size_gb == NULL))) { - mmtk_gc_init(0, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); - } else { - mmtk_gc_init(min_heap_size, max_heap_size, gcthreads, &mmtk_upcalls, (sizeof(jl_taggedvalue_t)), jl_buff_tag); - } -} - -// allocation wrappers that track allocation and let collection run - -JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) -{ - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - void *data = malloc(sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - malloc_maybe_collect(ptls, sz); - jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz); - } - return data; -} - -JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) -{ - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - void *data = calloc(nm, sz); - if (data != NULL && pgcstack != NULL && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - malloc_maybe_collect(ptls, nm * sz); - jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, nm * sz); - } - return data; -} - -JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) -{ - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - free(p); - if (pgcstack != NULL && ct->world_age) { - jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, -sz); - } -} - -JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) -{ - jl_gcframe_t **pgcstack = jl_get_pgcstack(); - jl_task_t *ct = jl_current_task; - if (pgcstack && ct->world_age) { - jl_ptls_t ptls = ct->ptls; - malloc_maybe_collect(ptls, sz); - if (sz < old) - jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, old - sz); - else - jl_atomic_fetch_add_relaxed(&JULIA_MALLOC_BYTES, sz - old); - } - return realloc(p, sz); -} - -jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) -{ - size_t len = jl_string_len(s); - jl_value_t *snew = jl_alloc_string(sz); - memcpy(jl_string_data(snew), jl_string_data(s), sz <= len ? 
sz : len);
- if (mmtk_is_pinned(s)) {
- // if the source string was pinned, we also pin the new one
- mmtk_pin_object(snew);
- }
- return snew;
-}
-
-JL_DLLEXPORT int jl_gc_enable_conservative_gc_support(void)
-{
- return 0;
-}
-
-JL_DLLEXPORT int jl_gc_conservative_gc_support_enabled(void)
-{
- return 0;
-}
-
-// TODO: if this is needed, it can be added in MMTk
-JL_DLLEXPORT jl_value_t *jl_gc_internal_obj_base_ptr(void *p)
-{
- return NULL;
-}
-
-
-// gc-debug functions
-// ---
-
-jl_gc_pagemeta_t *jl_gc_page_metadata(void *data)
-{
- return NULL;
-}
-
-JL_DLLEXPORT jl_taggedvalue_t *jl_gc_find_taggedvalue_pool(char *p, size_t *osize_p)
-{
- return NULL;
-}
-
-void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT
-{
-}
-
-void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT
-{
- // May not be accurate but should be helpful enough
- uint64_t pool_count = gc_num.poolalloc;
- uint64_t big_count = gc_num.bigalloc;
- jl_safe_printf("Allocations: %" PRIu64 " "
- "(Pool: %" PRIu64 "; Big: %" PRIu64 "); GC: %d\n",
- pool_count + big_count, pool_count, big_count, gc_num.pause);
-}
-
-void jl_print_gc_stats(JL_STREAM *s)
-{
-}
-
-// gc thread function
-void jl_gc_threadfun(void *arg)
-{
- mmtk_unreachable();
-}
-
-// added for MMTk integration
-
-JL_DLLEXPORT void jl_gc_array_ptr_copy(jl_array_t *dest, void **dest_p, jl_array_t *src, void **src_p, ssize_t n) JL_NOTSAFEPOINT
-{
- jl_ptls_t ptls = jl_current_task->ptls;
- mmtk_memory_region_copy(&ptls->mmtk_mutator, jl_array_owner(src), src_p, jl_array_owner(dest), dest_p, n);
-}
-
-void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offset)
-{
- jl_ptls_t ptls = jl_current_task->ptls;
- size_t allocsz = mmtk_align_alloc_sz(sz);
- void* addr = mmtk_immortal_alloc_fast(&ptls->mmtk_mutator, allocsz, align, offset);
- return addr;
-}
-
-void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align, unsigned offset)
-{
- return jl_gc_perm_alloc_nolock(sz, zero, align, offset);
-}
-
-void jl_gc_notify_image_load(const char* img_data, size_t len)
-{
- mmtk_set_vm_space((void*)img_data, len);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // MMTK_GC

From b536f42167c70f5dfdfb947c9554077583abfbb2 Mon Sep 17 00:00:00 2001
From: Eduardo Souza
Date: Fri, 11 Oct 2024 01:56:12 +0000
Subject: [PATCH 116/116] Adding jl_gc_notify_image_alloc calls that set the log bit for the whole chunk of memory

---
 src/gc-interface.h | 7 ++++++-
 src/gc-mmtk.c | 9 ++++++---
 src/gc-stock.c | 5 +++++
 src/staticdata.c | 2 ++
 4 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/gc-interface.h b/src/gc-interface.h
index 6e36f5670c7f3..176efc81b7ca7 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -195,10 +195,15 @@ JL_DLLEXPORT void *jl_gc_perm_alloc(size_t sz, int zero, unsigned align,
 struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT;
 
 // This function notifies the GC about memory addresses that are set when loading the boot image.
-// The GC may use that information to, for instance, determine that such objects should
+// The GC may use that information to, for instance, determine that all objects in that chunk of memory should
 // be treated as marked and belonging to the old generation in nursery collections.
 void jl_gc_notify_image_load(const char* img_data, size_t len);
 
+// This function notifies the GC about memory addresses that are set when allocating the boot image.
+// The GC may use that information to, for instance, determine that all objects in that chunk of memory should
+// be treated as marked and belonging to the old generation in nursery collections.
+void jl_gc_notify_image_alloc(char* img_data, size_t len);
+
 // ========================================================================= //
 // Runtime Write-Barriers
 // ========================================================================= //

diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index 1f20ba875b150..f4a44471f37f8 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -453,6 +453,7 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs
 jl_ptls_t ptls = jl_current_task->ptls;
 size_t allocsz = mmtk_align_alloc_sz(sz);
 void* addr = mmtk_immortal_alloc_fast(&ptls->gc_tls.mmtk_mutator, allocsz, align, offset);
+ mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(addr), allocsz);
 return addr;
 }
@@ -468,9 +469,6 @@ jl_value_t *jl_gc_permobj(size_t sz, void *ty) JL_NOTSAFEPOINT
 sizeof(void*) * 2 : 16));
 jl_taggedvalue_t *o = (jl_taggedvalue_t*)jl_gc_perm_alloc(allocsz, 0, align,
 sizeof(void*) % align);
-
- jl_ptls_t ptls = jl_current_task->ptls;
- mmtk_immortal_post_alloc_fast(&ptls->gc_tls.mmtk_mutator, jl_valueof(o), allocsz);
 o->header = (uintptr_t)ty;
 return jl_valueof(o);
 }
@@ -608,6 +606,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
 mmtk_set_vm_space((void*)img_data, len);
 }
+void jl_gc_notify_image_alloc(char* img_data, size_t len)
+{
+ mmtk_immortal_region_post_alloc((void*)img_data, len);
+}
+
 // mutex for page profile
 uv_mutex_t page_profile_lock;
diff --git a/src/gc-stock.c b/src/gc-stock.c
index d193254834a56..e99db4c54d17e 100644
--- a/src/gc-stock.c
+++ b/src/gc-stock.c
@@ -3912,6 +3912,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
 // Do nothing
 }
+void jl_gc_notify_image_alloc(char* img_data, size_t len)
+{
+ // Do nothing
+}
+
 JL_DLLEXPORT unsigned char jl_gc_pin_object(void* obj) {
 return 0;
 }
diff --git a/src/staticdata.c b/src/staticdata.c
index 6f4bc61521c1a..af24a84f39854 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -3915,6 +3915,7 @@ static jl_value_t *jl_restore_package_image_from_stream(void* pkgimage_handle, i
 ios_seek(f, datastartpos);
 if (needs_permalloc) {
 sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
+ jl_gc_notify_image_alloc(sysimg, len);
 }
 else
 sysimg = &f->buf[f->bpos];
@@ -4024,6 +4025,7 @@ JL_DLLEXPORT void jl_restore_system_image(const char *fname)
 ios_seek_end(&f);
 size_t len = ios_pos(&f);
 char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
+ jl_gc_notify_image_alloc(sysimg, len);
 ios_seek(&f, 0);
 if (ios_readall(&f, sysimg, len) != len)
 jl_errorf("Error reading system image file.");
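
As context for this last commit: per the commit message, mmtk_immortal_region_post_alloc sets the same side log bit that the inlined write barrier earlier in this series tests, for the whole image region, so image objects behave like old objects whose first mutation is caught by the barrier. A C model of that bookkeeping follows (a sketch only; the real implementation lives in the mmtk-julia Rust binding, and the names below are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    extern uintptr_t mmtk_side_log_bit_base; /* stand-in for MMTK_SIDE_LOG_BIT_BASE_ADDRESS */

    /* Model of mmtk_immortal_region_post_alloc: mark every 8-byte granule in
       [start, start + len) as unlogged, so the write barrier's fast path sees
       the bit set and takes the slow path on the first write to each object. */
    static void set_log_bits_for_region(const char *start, size_t len)
    {
        for (uintptr_t addr = (uintptr_t)start; addr < (uintptr_t)start + len; addr += 8) {
            uint8_t *meta_addr = (uint8_t *)(mmtk_side_log_bit_base + (addr >> 6));
            *meta_addr |= (uint8_t)(1u << ((addr >> 3) & 7));
        }
    }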