Skip to content

Commit

Permalink
[CMake] Add clang-bolt target
Browse files Browse the repository at this point in the history
This patch adds a `CLANG_BOLT_INSTRUMENT` option that applies BOLT instrumentation
to Clang, performs a bootstrap build with the resulting Clang, merges resulting
fdata files into a single profile file, and uses it to perform BOLT optimization
on the original Clang binary.

The projects and targets used for bootstrap/profile collection are configurable via
`CLANG_BOLT_INSTRUMENT_PROJECTS` and `CLANG_BOLT_INSTRUMENT_TARGETS`.
The defaults are "llvm" and "count" respectively, which results in a profile with
~5.3B dynamically executed instructions.

The intended use of the functionality is through the BOLT CMake cache file, similar
to the PGO 2-stage build:
```
cmake <llvm-project>/llvm -C <llvm-project>/clang/cmake/caches/BOLT.cmake
ninja clang++-bolt # pulls clang-bolt
```

Stats with a recent checkout (clang-16), pre-built BOLT and Clang, 72vCPU/224G:
| CMake configure with host Clang + BOLT.cmake | 1m6.592s
| Instrumenting Clang with BOLT | 2m50.508s
| CMake configure `llvm` with instrumented Clang | 5m46.364s (~5x slowdown)
| CMake build `not` with instrumented Clang | 0m6.456s
| Merging fdata files | 0m9.439s
| Optimizing Clang with BOLT | 0m39.201s

Building Clang:
```
cmake ../llvm-project/llvm -DCMAKE_C_COMPILER=... -DCMAKE_CXX_COMPILER=...
  -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang
  -DLLVM_TARGETS_TO_BUILD=Native -GNinja
```

| | Release | BOLT-optimized
| cmake | 0m24.016s | 0m22.333s
| ninja clang | 5m55.692s | 4m35.122s

I know it's not rigorous, but it shows a ballpark figure.

Reviewed By: phosek

Differential Revision: https://reviews.llvm.org/D132975
  • Loading branch information
aaupov committed Sep 23, 2022
1 parent 1aaba40 commit 3dab7fe
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 4 deletions.
114 changes: 113 additions & 1 deletion clang/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ CMAKE_DEPENDENT_OPTION(CLANG_PLUGIN_SUPPORT
"HAVE_CLANG_PLUGIN_SUPPORT" OFF)

# If libstdc++ is statically linked, clang-repl needs to statically link libstdc++
# itself, which is not possible in many platforms because of current limitations in
# itself, which is not possible in many platforms because of current limitations in
# JIT stack. (more platforms need to be supported by JITLink)
if(NOT LLVM_STATIC_LINK_CXX_STDLIB)
set(HAVE_CLANG_REPL_SUPPORT ON)
Expand Down Expand Up @@ -881,6 +881,118 @@ if (CLANG_ENABLE_BOOTSTRAP)
endforeach()
endif()

# BOLT optimization pipeline for Clang, enabled by CLANG_BOLT_INSTRUMENT.
#
# Targets defined here, in dependency order:
#   clang-instrumented            - clang rewritten by llvm-bolt with
#                                   instrumentation (clang-bolt.inst)
#   clang++-instrumented          - clang++-bolt.inst symlink to the above
#   bolt-instrumentation-profile  - nested build of
#                                   CLANG_BOLT_INSTRUMENT_TARGETS from
#                                   CLANG_BOLT_INSTRUMENT_PROJECTS, compiled
#                                   with the instrumented compiler
#   clang-bolt-profile            - fdata files merged into prof.fdata
#   clang-bolt                    - original clang optimized by llvm-bolt with
#                                   the merged profile
#   clang++-bolt                  - clang++-bolt symlink to the above
if (CLANG_BOLT_INSTRUMENT)
  set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
  set(CLANGXX_PATH ${CLANG_PATH}++)
  set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
  set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
  set(CLANG_OPTIMIZED ${CLANG_PATH}-bolt)
  set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt)

  # Instrument clang with BOLT
  add_custom_target(clang-instrumented
    DEPENDS ${CLANG_INSTRUMENTED}
  )
  add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
    DEPENDS clang llvm-bolt
    COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
      -instrument --instrumentation-file-append-pid
      --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
    COMMENT "Instrumenting clang binary with BOLT"
    VERBATIM
  )

  # Make a symlink from clang-bolt.inst to clang++-bolt.inst
  add_custom_target(clang++-instrumented
    DEPENDS ${CLANGXX_INSTRUMENTED}
  )
  add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED}
    DEPENDS clang-instrumented
    COMMAND ${CMAKE_COMMAND} -E create_symlink
      ${CLANG_INSTRUMENTED}
      ${CLANGXX_INSTRUMENTED}
    COMMENT "Creating symlink from BOLT instrumented clang to clang++"
    VERBATIM
  )

  # Build specified targets with instrumented Clang to collect the profile
  set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/)
  set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/)
  set(build_configuration "$<CONFIG>")

  # An unquoted list expansion inside CMAKE_ARGS is split on ';', so a
  # multi-valued setting such as "clang;lld" would reach the nested configure
  # as "-DLLVM_ENABLE_PROJECTS=clang" followed by a stray "lld" argument.
  # Encode the separators as '|' in local copies and let LIST_SEPARATOR turn
  # them back into ';' when ExternalProject assembles the command line.
  string(REPLACE ";" "|" bolt_instrument_projects
    "${CLANG_BOLT_INSTRUMENT_PROJECTS}")
  string(REPLACE ";" "|" bolt_instrument_llvm_targets
    "${LLVM_TARGETS_TO_BUILD}")

  include(ExternalProject)
  ExternalProject_Add(bolt-instrumentation-profile
    DEPENDS clang++-instrumented
    PREFIX bolt-instrumentation-profile
    SOURCE_DIR ${CMAKE_SOURCE_DIR}
    STAMP_DIR ${STAMP_DIR}
    BINARY_DIR ${BINARY_DIR}
    EXCLUDE_FROM_ALL 1
    LIST_SEPARATOR |
    CMAKE_ARGS
      ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS}
      # We shouldn't need to set this here, but INSTALL_DIR doesn't
      # seem to work, so instead I'm passing this through
      -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
      -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED}
      -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED}
      -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED}
      -DCMAKE_ASM_COMPILER_ID=Clang
      -DCMAKE_BUILD_TYPE=Release
      -DLLVM_ENABLE_PROJECTS=${bolt_instrument_projects}
      -DLLVM_TARGETS_TO_BUILD=${bolt_instrument_llvm_targets}
    BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR}
      --config ${build_configuration}
      --target ${CLANG_BOLT_INSTRUMENT_TARGETS}
    # Only the profile produced while building matters; nothing is installed.
    INSTALL_COMMAND ""
    # Exposes bolt-instrumentation-profile-configure and
    # bolt-instrumentation-profile-build as standalone targets.
    STEP_TARGETS configure build
    USES_TERMINAL_CONFIGURE 1
    USES_TERMINAL_BUILD 1
    USES_TERMINAL_INSTALL 1
  )

  # Merge profiles into one using merge-fdata
  add_custom_target(clang-bolt-profile
    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
  )
  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
    DEPENDS merge-fdata bolt-instrumentation-profile-build
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    # perf-helper.py collects the fdata files under the given directory and
    # hands them to the merge-fdata tool.
    COMMAND ${Python3_EXECUTABLE}
      ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
      $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
      ${CMAKE_CURRENT_BINARY_DIR}
    COMMENT "Preparing BOLT profile"
    VERBATIM
  )

  # Optimize original (pre-bolt) Clang using the collected profile
  add_custom_target(clang-bolt
    DEPENDS ${CLANG_OPTIMIZED}
  )
  add_custom_command(OUTPUT ${CLANG_OPTIMIZED}
    DEPENDS clang-bolt-profile
    COMMAND llvm-bolt ${CLANG_PATH}
      -o ${CLANG_OPTIMIZED}
      -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
      -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
      -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
    COMMENT "Optimizing Clang with BOLT"
    VERBATIM
  )

  # Make a symlink from clang-bolt to clang++-bolt
  add_custom_target(clang++-bolt
    DEPENDS ${CLANGXX_OPTIMIZED}
  )
  add_custom_command(OUTPUT ${CLANGXX_OPTIMIZED}
    DEPENDS clang-bolt
    COMMAND ${CMAKE_COMMAND} -E create_symlink
      ${CLANG_OPTIMIZED}
      ${CLANGXX_OPTIMIZED}
    COMMENT "Creating symlink from BOLT optimized clang to clang++"
    VERBATIM
  )
endif()

# Add the utils/ClangVisualizers subdirectory to the build only when
# LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION is enabled.
if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
add_subdirectory(utils/ClangVisualizers)
endif()
Expand Down
15 changes: 15 additions & 0 deletions clang/cmake/caches/BOLT.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# CMake cache script that pre-populates the settings for building a
# BOLT-optimized Clang. Intended to be passed to the configure step with
# `cmake -C <this file>`.

# Release build with the BOLT instrumentation pipeline switched on.
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
# Projects to enable and targets to build with the instrumented Clang while
# collecting the profile.
set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "")
set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "")
# Link executables with --emit-relocs and -z now.
# NOTE(review): presumably needed so llvm-bolt can rewrite the clang binary -
# confirm against the BOLT documentation.
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
# Extra flags passed to the CMake configure of the profile-collection build;
# empty by default.
set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")

# Build BOLT together with Clang, for the native target only.
set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")

# Disable function splitting enabled by default in GCC8+
# NOTE(review): these set() calls do not use CACHE, and CMAKE_CXX_COMPILER_ID
# is read here before any project() call has detected a compiler. When this
# file is loaded with -C, confirm that the branch is ever taken and that the
# flags actually reach the project.
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition")
endif()
18 changes: 15 additions & 3 deletions clang/utils/perf-training/perf-helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,24 @@ def clean(args):

def merge(args):
if len(args) != 3:
print('Usage: %s clean <llvm-profdata> <output> <path>\n' % __file__ +
print('Usage: %s merge <llvm-profdata> <output> <path>\n' % __file__ +
'\tMerges all profraw files from path into output.')
return 1
cmd = [args[0], 'merge', '-o', args[1]]
cmd.extend(findFilesWithExtension(args[2], "profraw"))
subprocess.check_call(cmd)
return 0

def merge_fdata(args):
  """Merge all fdata files found under a path into a single output file.

  args must hold exactly three items: the merge-fdata executable, the output
  file, and the path to search for files with the "fdata" extension.

  Returns 0 after the merge tool has run, or 1 (after printing usage) when the
  argument count is wrong. subprocess.check_call raises CalledProcessError if
  the tool exits with a non-zero status.
  """
  if len(args) == 3:
    tool = args[0]
    output = args[1]
    search_path = args[2]
    # Invocation shape: <merge-fdata> -o <output> <input>...
    command_line = [tool, '-o', output]
    command_line.extend(findFilesWithExtension(search_path, "fdata"))
    subprocess.check_call(command_line)
    return 0

  usage = 'Usage: %s merge-fdata <merge-fdata> <output> <path>\n' % __file__
  print(usage + '\tMerges all fdata files from path into output.')
  return 1

def dtrace(args):
parser = argparse.ArgumentParser(prog='perf-helper dtrace',
description='dtrace wrapper for order file generation')
Expand Down Expand Up @@ -395,10 +405,12 @@ def genOrderFile(args):
return 0

commands = {'clean' : clean,
'merge' : merge,
'merge' : merge,
'dtrace' : dtrace,
'cc1' : cc1,
'gen-order-file' : genOrderFile}
'gen-order-file' : genOrderFile,
'merge-fdata' : merge_fdata,
}

def main():
f = commands[sys.argv[1]]
Expand Down

0 comments on commit 3dab7fe

Please sign in to comment.