From ef8e98d08ad70fb3b982ea5d38aa0df65a2013c4 Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Wed, 10 Aug 2022 16:55:00 -0400 Subject: [PATCH 01/11] Add support for the BOLT post-link binary optimizer Using [bolt](https://github.com/llvm/llvm-project/tree/main/bolt) provides a fairly large speedup without any code or functionality changes. It provides roughly a 1% speedup on pyperformance, and a 4% improvement on the Pyston web macrobenchmarks. It is gated behind an `--enable-bolt` configure arg because not all toolchains and environments are supported. It has been tested on a Linux x86_64 toolchain, using llvm-bolt built from the LLVM 14.0.6 sources (their binary distribution of this version did not include bolt). Compared to [a previous attempt](https://github.com/faster-cpython/ideas/issues/224), this commit uses bolt's preferred "instrumentation" approach, as well as adds some non-PIE flags which enable much better optimizations from bolt. The effects of this change are a bit more dependent on CPU microarchitecture than other changes, since it optimizes i-cache behavior which seems to be a bit more variable between architectures. The 1%/4% numbers were collected on an Intel Skylake CPU, and on an AMD Zen 3 CPU I got a slightly larger speedup (2%/4%), and on a c6i.xlarge EC2 instance I got a slightly lower speedup (1%/3%). The low speedup on pyperformance is not entirely unexpected, because BOLT improves i-cache behavior, and the benchmarks in the pyperformance suite are small and tend to fit in i-cache. This change uses the existing pgo profiling task (`python -m test --pgo`), though I was able to measure about a 1% macrobenchmark improvement by using the macrobenchmarks as the training task. I personally think that both the PGO and BOLT tasks should be updated to use macrobenchmarks, but for the sake of splitting up the work this PR uses the existing pgo task. --- Makefile.pre.in | 9 ++ Misc/no-pie-compile.specs | 2 + Misc/no-pie-link.specs | 2 + configure | 259 ++++++++++++++++++++++++++++++++++++++ configure.ac | 51 ++++++++ 5 files changed, 323 insertions(+) create mode 100644 Misc/no-pie-compile.specs create mode 100644 Misc/no-pie-link.specs diff --git a/Makefile.pre.in b/Makefile.pre.in index 79616160e495e3..d373aaf532d03d 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -640,6 +640,15 @@ profile-opt: profile-run-stamp -rm -f profile-clean-stamp $(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)" +bolt-opt: @PREBOLT_RULE@ + rm -f *.fdata + @LLVM_BOLT@ $(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst + ./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true + @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata + @LLVM_BOLT@ $(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot + rm -f *.fdata + mv $(BUILDPYTHON).bolt $(BUILDPYTHON) + # Compile and run with gcov .PHONY=coverage coverage-lcov coverage-report coverage: diff --git a/Misc/no-pie-compile.specs b/Misc/no-pie-compile.specs new file mode 100644 index 00000000000000..2277b97efd4e3f --- /dev/null +++ b/Misc/no-pie-compile.specs @@ -0,0 +1,2 @@ +*self_spec: ++ %{!r:%{!fpie:%{!fPIE:%{!fpic:%{!fPIC:%{!fno-pic:-fno-PIE}}}}}} diff --git a/Misc/no-pie-link.specs b/Misc/no-pie-link.specs new file mode 100644 index 00000000000000..54db649b1e772b --- /dev/null +++ b/Misc/no-pie-link.specs @@ -0,0 +1,2 @@ +*self_spec: ++ %{!shared:%{!r:%{!fPIE:%{!pie:-fno-PIE -no-pie}}}} diff --git a/configure b/configure index 3f25d43dde6f0c..29f23069f0a452 100755 --- a/configure +++ b/configure @@ -887,6 +887,9 @@ LLVM_PROF_FILE LLVM_PROF_MERGER PGO_PROF_USE_FLAG PGO_PROF_GEN_FLAG +MERGE_FDATA +LLVM_BOLT +PREBOLT_RULE LLVM_AR_FOUND LLVM_AR PROFILE_TASK @@ -1048,6 +1051,7 @@ enable_pystats with_assertions enable_optimizations with_lto +enable_bolt with_address_sanitizer with_memory_sanitizer with_undefined_behavior_sanitizer @@ -1773,6 +1777,8 @@ Optional Features: --enable-pystats enable internal statistics gathering (default is no) --enable-optimizations enable expensive, stable optimizations (PGO, etc.) (default is no) + --enable-bolt enable usage of the llvm-bolt post-link optimizer + (default is no) --enable-loadable-sqlite-extensions support loadable extensions in the sqlite3 module, see Doc/library/sqlite3.rst (default is no) @@ -7722,6 +7728,259 @@ $as_echo "$as_me: llvm-ar found via xcrun: ${LLVM_AR}" >&6;} LDFLAGS_NODIST="$LDFLAGS_NODIST $LTOFLAGS" fi +# Enable bolt flags +Py_BOLT='false' +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --enable-bolt" >&5 +$as_echo_n "checking for --enable-bolt... " >&6; } +# Check whether --enable-bolt was given. +if test "${enable_bolt+set}" = set; then : + enableval=$enable_bolt; +if test "$enableval" != no +then + Py_BOLT='true' + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; }; +else + Py_BOLT='false' + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; }; +fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + +if test "$Py_BOLT" = 'true' ; then + PREBOLT_RULE="${DEF_MAKE_ALL_RULE}" + DEF_MAKE_ALL_RULE="bolt-opt" + DEF_MAKE_RULE="build_all" + + # These flags are required for bolt to work: + CFLAGS_NODIST="$CFLAGS_NODIST -fno-reorder-blocks-and-partition" + LDFLAGS_NODIST="$LDFLAGS_NODIST -Wl,--emit-relocs" + + # These flags are required to get good performance from bolt: + CFLAGS_NODIST="$CFLAGS_NODIST -specs=Misc/no-pie-compile.specs" + LDFLAGS_NODIST="$LDFLAGS_NODIST -specs=Misc/no-pie-link.specs" + LDFLAGS_NOLTO="$LDFLAGS_NOLTO -specs=Misc/no-pie-link.specs" + + + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}llvm-bolt", so it can be a program name with args. +set dummy ${ac_tool_prefix}llvm-bolt; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_LLVM_BOLT+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $LLVM_BOLT in + [\\/]* | ?:[\\/]*) + ac_cv_path_LLVM_BOLT="$LLVM_BOLT" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in ${llvm_path} +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_LLVM_BOLT="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +LLVM_BOLT=$ac_cv_path_LLVM_BOLT +if test -n "$LLVM_BOLT"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LLVM_BOLT" >&5 +$as_echo "$LLVM_BOLT" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_path_LLVM_BOLT"; then + ac_pt_LLVM_BOLT=$LLVM_BOLT + # Extract the first word of "llvm-bolt", so it can be a program name with args. +set dummy llvm-bolt; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_LLVM_BOLT+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $ac_pt_LLVM_BOLT in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_LLVM_BOLT="$ac_pt_LLVM_BOLT" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in ${llvm_path} +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_LLVM_BOLT="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +ac_pt_LLVM_BOLT=$ac_cv_path_ac_pt_LLVM_BOLT +if test -n "$ac_pt_LLVM_BOLT"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_LLVM_BOLT" >&5 +$as_echo "$ac_pt_LLVM_BOLT" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_pt_LLVM_BOLT" = x; then + LLVM_BOLT="''" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + LLVM_BOLT=$ac_pt_LLVM_BOLT + fi +else + LLVM_BOLT="$ac_cv_path_LLVM_BOLT" +fi + + if test -n "${LLVM_BOLT}" -a -x "${LLVM_BOLT}" + then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: \"Found llvm-bolt\"" >&5 +$as_echo "\"Found llvm-bolt\"" >&6; } + else + as_fn_error $? "llvm-bolt is required for a --enable-bolt build but could not be found." "$LINENO" 5 + fi + + + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}merge-fdata", so it can be a program name with args. +set dummy ${ac_tool_prefix}merge-fdata; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_MERGE_FDATA+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $MERGE_FDATA in + [\\/]* | ?:[\\/]*) + ac_cv_path_MERGE_FDATA="$MERGE_FDATA" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in ${llvm_path} +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_MERGE_FDATA="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +MERGE_FDATA=$ac_cv_path_MERGE_FDATA +if test -n "$MERGE_FDATA"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $MERGE_FDATA" >&5 +$as_echo "$MERGE_FDATA" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_path_MERGE_FDATA"; then + ac_pt_MERGE_FDATA=$MERGE_FDATA + # Extract the first word of "merge-fdata", so it can be a program name with args. +set dummy merge-fdata; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_ac_pt_MERGE_FDATA+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $ac_pt_MERGE_FDATA in + [\\/]* | ?:[\\/]*) + ac_cv_path_ac_pt_MERGE_FDATA="$ac_pt_MERGE_FDATA" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in ${llvm_path} +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_path_ac_pt_MERGE_FDATA="$as_dir/$ac_word$ac_exec_ext" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +ac_pt_MERGE_FDATA=$ac_cv_path_ac_pt_MERGE_FDATA +if test -n "$ac_pt_MERGE_FDATA"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_pt_MERGE_FDATA" >&5 +$as_echo "$ac_pt_MERGE_FDATA" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_pt_MERGE_FDATA" = x; then + MERGE_FDATA="''" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + MERGE_FDATA=$ac_pt_MERGE_FDATA + fi +else + MERGE_FDATA="$ac_cv_path_MERGE_FDATA" +fi + + if test -n "${MERGE_FDATA}" -a -x "${MERGE_FDATA}" + then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: \"Found merge-fdata\"" >&5 +$as_echo "\"Found merge-fdata\"" >&6; } + else + as_fn_error $? "merge-fdata is required for a --enable-bolt build but could not be found." "$LINENO" 5 + fi +fi + # Enable PGO flags. diff --git a/configure.ac b/configure.ac index 8decd9ebae84cf..66ed627ad135e2 100644 --- a/configure.ac +++ b/configure.ac @@ -1881,6 +1881,57 @@ if test "$Py_LTO" = 'true' ; then LDFLAGS_NODIST="$LDFLAGS_NODIST $LTOFLAGS" fi +# Enable bolt flags +Py_BOLT='false' +AC_MSG_CHECKING(for --enable-bolt) +AC_ARG_ENABLE(bolt, AS_HELP_STRING( + [--enable-bolt], + [enable usage of the llvm-bolt post-link optimizer (default is no)]), +[ +if test "$enableval" != no +then + Py_BOLT='true' + AC_MSG_RESULT(yes); +else + Py_BOLT='false' + AC_MSG_RESULT(no); +fi], +[AC_MSG_RESULT(no)]) + +AC_SUBST(PREBOLT_RULE) +if test "$Py_BOLT" = 'true' ; then + PREBOLT_RULE="${DEF_MAKE_ALL_RULE}" + DEF_MAKE_ALL_RULE="bolt-opt" + DEF_MAKE_RULE="build_all" + + # These flags are required for bolt to work: + CFLAGS_NODIST="$CFLAGS_NODIST -fno-reorder-blocks-and-partition" + LDFLAGS_NODIST="$LDFLAGS_NODIST -Wl,--emit-relocs" + + # These flags are required to get good performance from bolt: + CFLAGS_NODIST="$CFLAGS_NODIST -specs=Misc/no-pie-compile.specs" + LDFLAGS_NODIST="$LDFLAGS_NODIST -specs=Misc/no-pie-link.specs" + LDFLAGS_NOLTO="$LDFLAGS_NOLTO -specs=Misc/no-pie-link.specs" + + AC_SUBST(LLVM_BOLT) + AC_PATH_TOOL(LLVM_BOLT, llvm-bolt, '', ${llvm_path}) + if test -n "${LLVM_BOLT}" -a -x "${LLVM_BOLT}" + then + AC_MSG_RESULT("Found llvm-bolt") + else + AC_MSG_ERROR([llvm-bolt is required for a --enable-bolt build but could not be found.]) + fi + + AC_SUBST(MERGE_FDATA) + AC_PATH_TOOL(MERGE_FDATA, merge-fdata, '', ${llvm_path}) + if test -n "${MERGE_FDATA}" -a -x "${MERGE_FDATA}" + then + AC_MSG_RESULT("Found merge-fdata") + else + AC_MSG_ERROR([merge-fdata is required for a --enable-bolt build but could not be found.]) + fi +fi + # Enable PGO flags. AC_SUBST(PGO_PROF_GEN_FLAG) AC_SUBST(PGO_PROF_USE_FLAG) From 1448a68bc6884b084d568067ff3936231ff52733 Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Fri, 12 Aug 2022 12:54:39 -0400 Subject: [PATCH 02/11] Simplify the build flags --- Misc/no-pie-compile.specs | 2 -- Misc/no-pie-link.specs | 2 -- configure | 6 +++--- configure.ac | 6 +++--- 4 files changed, 6 insertions(+), 10 deletions(-) delete mode 100644 Misc/no-pie-compile.specs delete mode 100644 Misc/no-pie-link.specs diff --git a/Misc/no-pie-compile.specs b/Misc/no-pie-compile.specs deleted file mode 100644 index 2277b97efd4e3f..00000000000000 --- a/Misc/no-pie-compile.specs +++ /dev/null @@ -1,2 +0,0 @@ -*self_spec: -+ %{!r:%{!fpie:%{!fPIE:%{!fpic:%{!fPIC:%{!fno-pic:-fno-PIE}}}}}} diff --git a/Misc/no-pie-link.specs b/Misc/no-pie-link.specs deleted file mode 100644 index 54db649b1e772b..00000000000000 --- a/Misc/no-pie-link.specs +++ /dev/null @@ -1,2 +0,0 @@ -*self_spec: -+ %{!shared:%{!r:%{!fPIE:%{!pie:-fno-PIE -no-pie}}}} diff --git a/configure b/configure index 29f23069f0a452..d11baae8d08484 100755 --- a/configure +++ b/configure @@ -7762,9 +7762,9 @@ if test "$Py_BOLT" = 'true' ; then LDFLAGS_NODIST="$LDFLAGS_NODIST -Wl,--emit-relocs" # These flags are required to get good performance from bolt: - CFLAGS_NODIST="$CFLAGS_NODIST -specs=Misc/no-pie-compile.specs" - LDFLAGS_NODIST="$LDFLAGS_NODIST -specs=Misc/no-pie-link.specs" - LDFLAGS_NOLTO="$LDFLAGS_NOLTO -specs=Misc/no-pie-link.specs" + CFLAGS_NODIST="$CFLAGS_NODIST -fno-pie" + # We want to add these no-pie flags to linking executables but not shared libraries: + LINKCC="$LINKCC -fno-pie -no-pie" if test -n "$ac_tool_prefix"; then diff --git a/configure.ac b/configure.ac index 66ed627ad135e2..d14e29ed27568a 100644 --- a/configure.ac +++ b/configure.ac @@ -1909,9 +1909,9 @@ if test "$Py_BOLT" = 'true' ; then LDFLAGS_NODIST="$LDFLAGS_NODIST -Wl,--emit-relocs" # These flags are required to get good performance from bolt: - CFLAGS_NODIST="$CFLAGS_NODIST -specs=Misc/no-pie-compile.specs" - LDFLAGS_NODIST="$LDFLAGS_NODIST -specs=Misc/no-pie-link.specs" - LDFLAGS_NOLTO="$LDFLAGS_NOLTO -specs=Misc/no-pie-link.specs" + CFLAGS_NODIST="$CFLAGS_NODIST -fno-pie" + # We want to add these no-pie flags to linking executables but not shared libraries: + LINKCC="$LINKCC -fno-pie -no-pie" AC_SUBST(LLVM_BOLT) AC_PATH_TOOL(LLVM_BOLT, llvm-bolt, '', ${llvm_path}) From c5463743e5b7f6661d44b9ec5795a9977d2dff94 Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Fri, 12 Aug 2022 13:06:18 -0400 Subject: [PATCH 03/11] Add a NEWS entry --- .../next/Build/2022-08-12-13-06-03.gh-issue-90536.qMpF6p.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Build/2022-08-12-13-06-03.gh-issue-90536.qMpF6p.rst diff --git a/Misc/NEWS.d/next/Build/2022-08-12-13-06-03.gh-issue-90536.qMpF6p.rst b/Misc/NEWS.d/next/Build/2022-08-12-13-06-03.gh-issue-90536.qMpF6p.rst new file mode 100644 index 00000000000000..4605e03915ee63 --- /dev/null +++ b/Misc/NEWS.d/next/Build/2022-08-12-13-06-03.gh-issue-90536.qMpF6p.rst @@ -0,0 +1,2 @@ +Use the BOLT post-link optimizer to improve performance, particularly on +medium-to-large applications. From c12dbea53581a595c49d94712cce0500b2c3759d Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:31:54 -0400 Subject: [PATCH 04/11] Update Makefile.pre.in Co-authored-by: Dong-hee Na --- Makefile.pre.in | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.pre.in b/Makefile.pre.in index d373aaf532d03d..7af2d40a4b4045 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -647,6 +647,7 @@ bolt-opt: @PREBOLT_RULE@ @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata @LLVM_BOLT@ $(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot rm -f *.fdata + rm -f $(BUILDPYTHON).bolt_inst mv $(BUILDPYTHON).bolt $(BUILDPYTHON) # Compile and run with gcov From ce25757e6e990941cffda4d156f17240a58aac6a Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:32:13 -0400 Subject: [PATCH 05/11] Update configure.ac Co-authored-by: Dong-hee Na --- configure.ac | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index d14e29ed27568a..196feb4ac88df3 100644 --- a/configure.ac +++ b/configure.ac @@ -1912,7 +1912,9 @@ if test "$Py_BOLT" = 'true' ; then CFLAGS_NODIST="$CFLAGS_NODIST -fno-pie" # We want to add these no-pie flags to linking executables but not shared libraries: LINKCC="$LINKCC -fno-pie -no-pie" - + # Designate the DWARF version into 4 since the LLVM-BOLT does not support DWARF5 yet. + CFLAGS="$CFLAGS -gdwarf-4" + LDFLAGS="$LDFLAGS -gdwarf-4" AC_SUBST(LLVM_BOLT) AC_PATH_TOOL(LLVM_BOLT, llvm-bolt, '', ${llvm_path}) if test -n "${LLVM_BOLT}" -a -x "${LLVM_BOLT}" From ded38f0e57f760c37778ec141bf5f11c3e2b3bbd Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:20:17 -0400 Subject: [PATCH 06/11] Add myself to ACKS --- Misc/ACKS | 1 + 1 file changed, 1 insertion(+) diff --git a/Misc/ACKS b/Misc/ACKS index c1f570acaaf8e0..16a482e40a5173 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -1212,6 +1212,7 @@ Gideon Mitchell Tim Mitchell Zubin Mithra Florian Mladitsch +Kevin Modzelewski Doug Moen Jakub Molinski Juliette Monsel From cc17806259eed5efb77d4c4d7808302cb1f399af Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:31:28 -0400 Subject: [PATCH 07/11] Add docs --- Doc/using/configure.rst | 20 ++++++++++++++++++-- Doc/whatsnew/3.12.rst | 4 ++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index 4e50e73a11b817..b0629e42269232 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -190,8 +190,8 @@ Install Options Performance options ------------------- -Configuring Python using ``--enable-optimizations --with-lto`` (PGO + LTO) is -recommended for best performance. +Configuring Python using ``--enable-optimizations --with-lto --enable-bolt`` +(PGO + LTO + BOLT) is recommended for best performance. .. cmdoption:: --enable-optimizations @@ -231,6 +231,22 @@ recommended for best performance. .. versionadded:: 3.11 To use ThinLTO feature, use ``--with-lto=thin`` on Clang. +.. cmdoption:: --enable-bolt + + Enable usage of the BOLT post-link binary optimizer (disabled by default). + + BOLT is part of the LLVM project but is not always included in their binary + distributions. This flag requires that ``llvm-bolt`` and ``merge-fdata`` + are available. + + BOLT is still a fairly new project so this flag should be considered + experimental for now. Because this tool operates on machine code its success + is dependent on a combination of the build environment + the other + optimization configure args + the CPU architecture, and not all combinations + are supported. + + .. versionadded:: 3.12 + .. cmdoption:: --with-computed-gotos Enable computed gotos in evaluation loop (enabled by default on supported diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 5926205ce5c07d..d91173205d34b2 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -133,6 +133,10 @@ Optimizations It reduces object size by 8 or 16 bytes on 64bit platform. (:pep:`623`) (Contributed by Inada Naoki in :gh:`92536`.) +* Added experimental support for using the BOLT binary optimizer in the build + process, which improves performance by 1-5%. + (Contributed by Kevin Modzelewski in :gh:`90536`.) + CPython bytecode changes ======================== From 0050190807789c7786f817da4eb04697fbb55f5c Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:34:51 -0400 Subject: [PATCH 08/11] Other review comments --- Makefile.pre.in | 4 ++-- configure | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile.pre.in b/Makefile.pre.in index 7af2d40a4b4045..94f9ac7b6a9ee2 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -642,10 +642,10 @@ profile-opt: profile-run-stamp bolt-opt: @PREBOLT_RULE@ rm -f *.fdata - @LLVM_BOLT@ $(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst + @LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst ./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata - @LLVM_BOLT@ $(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot + @LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot rm -f *.fdata rm -f $(BUILDPYTHON).bolt_inst mv $(BUILDPYTHON).bolt $(BUILDPYTHON) diff --git a/configure b/configure index d11baae8d08484..6b63db45537c5d 100755 --- a/configure +++ b/configure @@ -7765,7 +7765,9 @@ if test "$Py_BOLT" = 'true' ; then CFLAGS_NODIST="$CFLAGS_NODIST -fno-pie" # We want to add these no-pie flags to linking executables but not shared libraries: LINKCC="$LINKCC -fno-pie -no-pie" - + # Designate the DWARF version into 4 since the LLVM-BOLT does not support DWARF5 yet. + CFLAGS="$CFLAGS -gdwarf-4" + LDFLAGS="$LDFLAGS -gdwarf-4" if test -n "$ac_tool_prefix"; then # Extract the first word of "${ac_tool_prefix}llvm-bolt", so it can be a program name with args. From 83da8c4d5ed01c4f32e72ee22d36de1d1eddf13f Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Tue, 16 Aug 2022 17:39:34 -0400 Subject: [PATCH 09/11] fix tab/space issue --- Makefile.pre.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.pre.in b/Makefile.pre.in index 94f9ac7b6a9ee2..0622e6c35cd45d 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -647,7 +647,7 @@ bolt-opt: @PREBOLT_RULE@ @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata @LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions=3 -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=all -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot rm -f *.fdata - rm -f $(BUILDPYTHON).bolt_inst + rm -f $(BUILDPYTHON).bolt_inst mv $(BUILDPYTHON).bolt $(BUILDPYTHON) # Compile and run with gcov From 7f14cd18f7d4d693c16852d2bf4fa0c096f3131c Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Wed, 17 Aug 2022 13:39:48 -0400 Subject: [PATCH 10/11] Make it more clear that --enable-bolt is experimental --- Doc/using/configure.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index b0629e42269232..cdee36e6cee88c 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -190,8 +190,9 @@ Install Options Performance options ------------------- -Configuring Python using ``--enable-optimizations --with-lto --enable-bolt`` -(PGO + LTO + BOLT) is recommended for best performance. +Configuring Python using ``--enable-optimizations --with-lto`` (PGO + LTO) is +recommended for best performance. The experimental ``--enable-bolt`` flag can +also be used to improve performance. .. cmdoption:: --enable-optimizations From f0f5e5345c14b846d77378949e43a4d8cacdff14 Mon Sep 17 00:00:00 2001 From: Kevin Modzelewski Date: Thu, 18 Aug 2022 13:18:13 -0400 Subject: [PATCH 11/11] Add link to bolt's github page --- Doc/using/configure.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index cdee36e6cee88c..ef770d7747c3d7 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -234,7 +234,9 @@ also be used to improve performance. .. cmdoption:: --enable-bolt - Enable usage of the BOLT post-link binary optimizer (disabled by default). + Enable usage of the `BOLT post-link binary optimizer + ` (disabled by + default). BOLT is part of the LLVM project but is not always included in their binary distributions. This flag requires that ``llvm-bolt`` and ``merge-fdata``