Skip to content

Commit

Permalink
Make malloc() go 200x faster
Browse files Browse the repository at this point in the history
If pthread_create() is linked into the binary, then the cosmo runtime
will create an independent dlmalloc arena for each core. Whenever the
malloc() function is used it will index `g_heaps[sched_getcpu() / 2]`
to find the arena with the greatest hyperthread / numa locality. This
may be configured via an environment variable. For example if you say
`export COSMOPOLITAN_HEAP_COUNT=1` then you can restore the old ways.
Your process may be configured to have anywhere between 1 and 128 heaps.

We need this revision because it makes multithreaded C++ applications
faster. For example, an HTTP server I'm working on that makes extreme
use of the STL went from 16k to 2000k requests per second, after this
change was made. To understand why, try out the malloc_test benchmark
which calls malloc() + realloc() in a loop across many threads, which
sees a 250x improvement in process clock time and 200x on wall time

The tradeoff is this adds ~25ns of latency to individual malloc calls
compared to MODE=tiny, once the cosmo runtime has transitioned into a
fully multi-threaded state. If you don't need malloc() to be scalable
then cosmo provides many options for you. For starters the heap count
variable above can be set to put the process back in single heap mode
plus you can go even faster still, if you include tinymalloc.inc like
many of the programs in tool/build/.. are already doing since that'll
shave tens of kb off your binary footprint too. There's also MODE=tiny
which is configured to use just 1 plain old dlmalloc arena by default

Another tradeoff is we need more memory now (except in MODE=tiny), to
track the provenance of memory allocation. This is so allocations can
be freely shared across threads, and because OSes can reschedule code
to different CPUs at any time.
  • Loading branch information
jart committed Jun 5, 2024
1 parent 9906f29 commit 3609f65
Show file tree
Hide file tree
Showing 60 changed files with 858 additions and 1,064 deletions.
1 change: 1 addition & 0 deletions examples/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ EXAMPLES_DIRECTDEPS = \
THIRD_PARTY_TZ \
THIRD_PARTY_VQSORT \
THIRD_PARTY_XED \
THIRD_PARTY_LIBCXXABI \
THIRD_PARTY_ZLIB \
TOOL_ARGS \
TOOL_BUILD_LIB \
Expand Down
1 change: 1 addition & 0 deletions examples/package/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ EXAMPLES_PACKAGE_OBJS = \
EXAMPLES_PACKAGE_DIRECTDEPS = \
EXAMPLES_PACKAGE_LIB \
LIBC_INTRIN \
LIBC_MEM \
LIBC_STDIO \
LIBC_TINYMATH

Expand Down
1 change: 1 addition & 0 deletions examples/package/lib/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ EXAMPLES_PACKAGE_LIB_A_CHECKS = \
EXAMPLES_PACKAGE_LIB_A_DIRECTDEPS = \
LIBC_INTRIN \
LIBC_NEXGEN32E \
LIBC_MEM \
LIBC_STDIO

# Evaluates variable as set of transitive package dependencies.
Expand Down
19 changes: 5 additions & 14 deletions libc/calls/getcpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,20 +48,11 @@ int getcpu(unsigned *out_opt_cpu, unsigned *out_opt_node) {
} else {
return __winerr();
}
} else if (IsXnuSilicon()) {
if (__syslib->__version >= 9) {
size_t cpu64;
errno_t err = __syslib->__pthread_cpu_number_np(&cpu64);
if (!err) {
cpu = cpu64;
node = 0;
} else {
errno = err;
return -1;
}
} else {
return enosys();
}
} else if (IsAarch64()) {
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
node = 0;
} else {
int rc = sys_getcpu(&cpu, &node, 0);
if (rc == -1)
Expand Down
19 changes: 4 additions & 15 deletions libc/calls/sched_getcpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,12 @@
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/calls/calls.h"
#include "libc/calls/struct/cpuset.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/nexgen32e/rdtscp.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/nt/struct/processornumber.h"
#include "libc/nt/synchronization.h"
#include "libc/runtime/syslib.internal.h"
#include "libc/sysv/errfuns.h"

int sys_getcpu(unsigned *opt_cpu, unsigned *opt_node, void *tcache);
Expand All @@ -38,23 +36,14 @@ int sched_getcpu(void) {
unsigned tsc_aux;
rdtscp(&tsc_aux);
return TSC_AUX_CORE(tsc_aux);
} else if (IsAarch64()) {
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
return tpidr_el0 & 255;
} else if (IsWindows()) {
struct NtProcessorNumber pn;
GetCurrentProcessorNumberEx(&pn);
return 64 * pn.Group + pn.Number;
} else if (IsXnuSilicon()) {
if (__syslib->__version >= 9) {
size_t cpu;
errno_t err = __syslib->__pthread_cpu_number_np(&cpu);
if (!err) {
return cpu;
} else {
errno = err;
return -1;
}
} else {
return enosys();
}
} else {
unsigned cpu = 0;
int rc = sys_getcpu(&cpu, 0, 0);
Expand Down
2 changes: 2 additions & 0 deletions libc/dlopen/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ $(LIBC_DLOPEN_A_OBJS): private \
o/$(MODE)/libc/dlopen/foreign_tramp.o: libc/dlopen/foreign_tramp.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<

$(LIBC_DLOPEN_A_OBJS): private COPTS += -fno-sanitize=address

LIBC_DLOPEN_LIBS = $(foreach x,$(LIBC_DLOPEN_ARTIFACTS),$($(x)))
LIBC_DLOPEN_SRCS = $(foreach x,$(LIBC_DLOPEN_ARTIFACTS),$($(x)_SRCS))
LIBC_DLOPEN_HDRS = $(foreach x,$(LIBC_DLOPEN_ARTIFACTS),$($(x)_HDRS))
Expand Down
3 changes: 2 additions & 1 deletion libc/fmt/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ $(LIBC_FMT_A).pkg: \

$(LIBC_FMT_A_OBJS): private \
CFLAGS += \
-fno-jump-tables
-fno-jump-tables \
-fno-sanitize=address

o/$(MODE)/libc/fmt/formatint64.o \
o/$(MODE)/libc/fmt/formatint64thousands.o \
Expand Down
7 changes: 0 additions & 7 deletions libc/intrin/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,6 @@ o//libc/intrin/memmove.o: private \
-finline \
-foptimize-sibling-calls

# make asan stack traces shorter
o/$(MODE)/libc/intrin/asanthunk.o: private \
CFLAGS += \
-Os \
$(NO_MAGIC) \
-foptimize-sibling-calls

o/$(MODE)/libc/intrin/bzero.o \
o/$(MODE)/libc/intrin/memcmp.o \
o/$(MODE)/libc/intrin/memmove.o: private \
Expand Down
Loading

0 comments on commit 3609f65

Please sign in to comment.