diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index 15d28ff270c55..b627224e027a9 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -866,6 +866,8 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, code_allocated = true; #endif total_allocated += Size; + jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); + jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); if (exe_alloc) return (uint8_t*)exe_alloc->alloc(Size, Alignment); return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, @@ -879,6 +881,8 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, bool isReadOnly) { total_allocated += Size; + jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); + jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size); if (!isReadOnly) return (uint8_t*)rw_alloc.alloc(Size, Alignment); if (ro_alloc) diff --git a/src/gc.c b/src/gc.c index 60b110826ee80..b769ef40cc917 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1189,10 +1189,16 @@ static void reset_thread_gc_counts(void) JL_NOTSAFEPOINT } } +static int64_t inc_live_bytes(int64_t inc) JL_NOTSAFEPOINT +{ + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, inc); + return live_bytes += inc; +} + void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT { combine_thread_gc_counts(&gc_num); - live_bytes += (gc_num.deferred_alloc + gc_num.allocd); + inc_live_bytes(gc_num.deferred_alloc + gc_num.allocd); gc_num.allocd = 0; gc_num.deferred_alloc = 0; reset_thread_gc_counts(); @@ -3094,9 +3100,9 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) { combine_thread_gc_counts(&gc_num); -#ifdef USE_TRACY - TracyCPlot("Heap size", live_bytes + gc_num.allocd); -#endif + // We separate the update of the graph from the update of live_bytes here + // so that the sweep shows a downward trend in memory usage. + jl_timing_counter_inc(JL_TIMING_COUNTER_HeapSize, gc_num.allocd); jl_gc_markqueue_t *mq = &ptls->mark_queue; @@ -3342,7 +3348,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } last_live_bytes = live_bytes; + // Can't call inc_live_bytes here because we already added allocd + // to the graph earlier live_bytes += -gc_num.freed + gc_num.allocd; + jl_timing_counter_dec(JL_TIMING_COUNTER_HeapSize, gc_num.freed); if (collection == JL_GC_AUTO) { //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster @@ -3486,10 +3495,6 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) SetLastError(last_error); #endif errno = last_errno; - -#ifdef USE_TRACY - TracyCPlot("Heap size", jl_gc_live_bytes()); -#endif } void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq) @@ -3593,9 +3598,6 @@ void jl_gc_init(void) if (jl_options.heap_size_hint) jl_gc_set_max_memory(jl_options.heap_size_hint); -#ifdef USE_TRACY - TracyCPlotConfig("Heap size", TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); -#endif t_start = jl_hrtime(); } @@ -3781,7 +3783,7 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; - live_bytes += allocsz - oldsz; + inc_live_bytes(allocsz - oldsz); } else if (allocsz < oldsz) jl_atomic_store_relaxed(&ptls->gc_num.freed, diff --git a/src/gf.c b/src/gf.c index 00a51f582a597..5e5afbdaf7460 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1642,6 +1642,7 @@ static void do_nothing_with_codeinst(jl_code_instance_t *ci) {} // recursively invalidate cached methods that had an edge to a replaced method static void invalidate_method_instance(void (*f)(jl_code_instance_t*), jl_method_instance_t *replaced, size_t max_world, int depth) { + jl_timing_counter_inc(JL_TIMING_COUNTER_Invalidations, 1); if (_jl_debug_method_invalidation) { jl_value_t *boxeddepth = NULL; JL_GC_PUSH1(&boxeddepth); diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index c6aef2d35839c..c8ea8d9a18448 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -851,10 +851,27 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin { jitlink::PassConfiguration &Config) override { Config.PostAllocationPasses.push_back([this](jitlink::LinkGraph &G) { size_t graph_size = 0; + size_t code_size = 0; + size_t data_size = 0; for (auto block : G.blocks()) { graph_size += block->getSize(); } + for (auto §ion : G.sections()) { + size_t secsize = 0; + for (auto block : section.blocks()) { + secsize += block->getSize(); + } + if ((section.getMemProt() & jitlink::MemProt::Exec) == jitlink::MemProt::None) { + data_size += secsize; + } else { + code_size += secsize; + } + graph_size += secsize; + } this->total_size.fetch_add(graph_size, std::memory_order_relaxed); + jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, graph_size); + jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, code_size); + jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, data_size); return Error::success(); }); } diff --git a/src/staticdata.c b/src/staticdata.c index 353382f11a067..96c3c3d147856 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -3277,6 +3277,7 @@ static void jl_restore_system_image_from_stream_(ios_t *f, jl_image_t *image, jl arraylist_push(&jl_linkage_blobs, (void*)image_base); arraylist_push(&jl_linkage_blobs, (void*)(image_base + sizeof_sysimg + sizeof(uintptr_t))); arraylist_push(&jl_image_relocs, (void*)relocs_base); + jl_timing_counter_inc(JL_TIMING_COUNTER_ImageSize, sizeof_sysimg + sizeof(uintptr_t)); rebuild_image_blob_tree(); // jl_printf(JL_STDOUT, "%ld blobs to link against\n", jl_linkage_blobs.len >> 1); diff --git a/src/timing.c b/src/timing.c index 2e0dba7c025bc..3290e68ee9169 100644 --- a/src/timing.c +++ b/src/timing.c @@ -48,23 +48,38 @@ const char *jl_timing_names[(int)JL_TIMING_LAST] = #undef X }; +JL_DLLEXPORT jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST]; + #ifdef USE_ITTAPI JL_DLLEXPORT __itt_event jl_timing_ittapi_events[(int)JL_TIMING_EVENT_LAST]; #endif void jl_print_timings(void) { +#ifdef USE_TIMING_COUNTS uint64_t total_time = cycleclock() - t0; uint64_t root_time = total_time; for (int i = 0; i < JL_TIMING_LAST; i++) { root_time -= jl_timing_counts[i]; } jl_timing_counts[0] = root_time; + fprintf(stderr, "\nJULIA TIMINGS\n"); for (int i = 0; i < JL_TIMING_LAST; i++) { if (jl_timing_counts[i] != 0) fprintf(stderr, "%-25s : %5.2f %% %" PRIu64 "\n", jl_timing_names[i], 100 * (((double)jl_timing_counts[i]) / total_time), jl_timing_counts[i]); } + + fprintf(stderr, "\nJULIA COUNTERS\n"); +#define X(name) do { \ + int64_t val = (int64_t) jl_atomic_load_relaxed(&jl_timing_counters[(int)JL_TIMING_COUNTER_##name].basic_counter); \ + if (val != 0) \ + fprintf(stderr, "%-25s : %" PRIi64 "\n", #name, val); \ + } while (0); + + JL_TIMING_COUNTERS +#undef X +#endif } void jl_init_timing(void) @@ -79,6 +94,24 @@ void jl_init_timing(void) #define X(name) jl_timing_ittapi_events[i++] = __itt_event_create(#name, strlen(#name)); JL_TIMING_EVENTS #undef X + i = 0; +#define X(name) jl_timing_counters[i++].ittapi_counter = __itt_counter_create(#name, "julia.runtime"); + JL_TIMING_COUNTERS +#undef X +#endif +#ifdef USE_TRACY + i = 0; +#define X(counter_name) jl_timing_counters[i].tracy_counter = (jl_tracy_counter_t){0, #counter_name}; \ + TracyCPlotConfig(jl_timing_counters[i++].tracy_counter.name, TracyPlotFormatNumber, /* rectilinear */ 1, /* fill */ 1, /* color */ 0); + JL_TIMING_COUNTERS +#undef X + // We reference these by enum indexing and then asking for the name, since that allows the compiler + // to catch name mismatches. + TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_HeapSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); + TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_JITSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); + TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_JITCodeSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); + TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_JITDataSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); + TracyCPlotConfig(jl_timing_counters[JL_TIMING_COUNTER_ImageSize].tracy_counter.name, TracyPlotFormatMemory, /* rectilinear */ 0, /* fill */ 1, /* color */ 0); #endif } diff --git a/src/timing.h b/src/timing.h index 5d4e95c62bb67..13ac3c5a95de5 100644 --- a/src/timing.h +++ b/src/timing.h @@ -16,6 +16,13 @@ static inline const char *gnu_basename(const char *path) return base ? base+1 : path; } +#ifdef USE_TRACY +typedef struct { + _Atomic(int64_t) val; + char* name; +} jl_tracy_counter_t; +#endif + #ifdef __cplusplus extern "C" { #endif @@ -78,6 +85,9 @@ extern uint32_t jl_timing_print_limit; #define jl_timing_block_exit_task(ct, ptls) ((jl_timing_block_t *)NULL) #define jl_pop_timing_block(blk) +#define jl_timing_counter_inc(counter, value) +#define jl_timing_counter_dec(counter, value) + #define jl_profile_lock_init(lock, name) #define jl_profile_lock_start_wait(lock) #define jl_profile_lock_acquired(lock) @@ -181,6 +191,15 @@ void jl_timing_puts(jl_timing_block_t *cur_block, const char *str); X(NATIVE_Create) \ +#define JL_TIMING_COUNTERS \ + X(Invalidations) \ + X(HeapSize) \ + X(JITSize) \ + X(JITCodeSize) \ + X(JITDataSize) \ + X(ImageSize) \ + + enum jl_timing_owners { #define X(name) JL_TIMING_ ## name, JL_TIMING_OWNERS @@ -195,6 +214,13 @@ enum jl_timing_events { JL_TIMING_EVENT_LAST }; +enum jl_timing_counter_types { +#define X(name) JL_TIMING_COUNTER_ ## name, + JL_TIMING_COUNTERS +#undef X + JL_TIMING_COUNTER_LAST +}; + /** * Timing back-ends differ in terms of whether they support nested * and asynchronous events. @@ -404,6 +430,61 @@ struct jl_timing_suspend_cpp_t { _jl_timing_suspend_ctor(&__timing_suspend, #subsystem, ct) #endif +// Counting +#ifdef USE_ITTAPI +#define _ITTAPI_COUNTER_MEMBER __itt_counter ittapi_counter; +#else +#define _ITTAPI_COUNTER_MEMBER +#endif + +#ifdef USE_TRACY +# define _TRACY_COUNTER_MEMBER jl_tracy_counter_t tracy_counter; +# else +# define _TRACY_COUNTER_MEMBER +#endif + +#ifdef USE_TIMING_COUNTS +#define _COUNTS_MEMBER _Atomic(uint64_t) basic_counter; +#else +#define _COUNTS_MEMBER +#endif + +typedef struct { + _ITTAPI_COUNTER_MEMBER + _TRACY_COUNTER_MEMBER + _COUNTS_MEMBER +} jl_timing_counter_t; + +JL_DLLEXPORT extern jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST]; + +static inline void jl_timing_counter_inc(int counter, uint64_t val) JL_NOTSAFEPOINT { +#ifdef USE_ITTAPI + __itt_counter_inc_delta(jl_timing_counters[counter].ittapi_counter, val); +#endif +#ifdef USE_TRACY + jl_tracy_counter_t *tracy_counter = &jl_timing_counters[counter].tracy_counter; + uint64_t oldval = jl_atomic_fetch_add_relaxed(&tracy_counter->val, val); + TracyCPlotI(tracy_counter->name, oldval + val); +#endif +#ifdef USE_TIMING_COUNTS + jl_atomic_fetch_add_relaxed(&jl_timing_counters[counter].basic_counter, val); +#endif +} + +static inline void jl_timing_counter_dec(int counter, uint64_t val) JL_NOTSAFEPOINT { +#ifdef USE_ITTAPI + __itt_counter_dec_delta(jl_timing_counters[counter].ittapi_counter, val); +#endif +#ifdef USE_TRACY + jl_tracy_counter_t *tracy_counter = &jl_timing_counters[counter].tracy_counter; + uint64_t oldval = jl_atomic_fetch_add_relaxed(&tracy_counter->val, -val); + TracyCPlotI(tracy_counter->name, oldval - val); +#endif +#ifdef USE_TIMING_COUNTS + jl_atomic_fetch_add_relaxed(&jl_timing_counters[counter].basic_counter, -(int64_t)val); +#endif +} + // Locking profiling static inline void jl_profile_lock_init(jl_mutex_t *lock, const char *name) { #ifdef USE_ITTAPI