Skip to content

Commit

Permalink
[HIP] Align device binary
Browse files Browse the repository at this point in the history
To facilitate faster loading of device binaries and share them among processes,
HIP runtime favors their alignment being 4096 bytes. HIP runtime can load
unaligned device binaries, however, aligning them at 4096 bytes results in
faster loading and less shared memory usage.

This patch adds an option -bundle-align to clang-offload-bundler which allows
bundles to be aligned at specified alignment. By default it is 1, which is NFC
compared to existing format.

This patch then aligns embedded fat binary and device binary inside fat binary
at 4096 bytes.

It has been verified this change does not cause significant overall file size increase
for typical HIP applications (less than 1%).

Differential Revision: https://reviews.llvm.org/D88734
  • Loading branch information
yxsamliu committed Oct 2, 2020
1 parent 04fce15 commit dc6a0b0
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 4 deletions.
6 changes: 4 additions & 2 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,8 +597,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
if (CudaGpuBinary) {
// If fatbin is available from early finalization, create a string
// literal containing the fat binary loaded from the given file.
FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()),
"", FatbinConstantName, 8);
const unsigned HIPCodeObjectAlign = 4096;
FatBinStr =
makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
FatbinConstantName, HIPCodeObjectAlign);
} else {
// If fatbin is not available, create an external symbol
// __hip_fatbin in section .hip_fatbin. The external symbol is supposed
Expand Down
7 changes: 6 additions & 1 deletion clang/lib/Driver/ToolChains/HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "clang/Driver/Driver.h"
#include "clang/Driver/DriverDiagnostic.h"
#include "clang/Driver/Options.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetParser.h"
Expand All @@ -33,6 +34,7 @@ using namespace llvm::opt;
#endif

namespace {
const unsigned HIPCodeObjectAlign = 4096;

static void addBCLib(const Driver &D, const ArgList &Args,
ArgStringList &CmdArgs, ArgStringList LibraryPaths,
Expand Down Expand Up @@ -108,6 +110,8 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
// for different GPU archs.
ArgStringList BundlerArgs;
BundlerArgs.push_back(Args.MakeArgString("-type=o"));
BundlerArgs.push_back(
Args.MakeArgString("-bundle-align=" + Twine(HIPCodeObjectAlign)));

// ToDo: Remove the dummy host binary entry which is required by
// clang-offload-bundler.
Expand Down Expand Up @@ -175,7 +179,8 @@ void AMDGCN::Linker::constructGenerateObjFileFromHIPFatBinary(
ObjStream << " .section .hip_fatbin,\"aMS\",@progbits,1\n";
ObjStream << " .data\n";
ObjStream << " .globl __hip_fatbin\n";
ObjStream << " .p2align 3\n";
ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign))
<< "\n";
ObjStream << "__hip_fatbin:\n";
ObjStream << " .incbin \"" << BundleFile << "\"\n";
ObjStream.flush();
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/device-stub.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ void use_pointers() {
// ALL: @4 = private unnamed_addr constant [21 x i8] c"ext_constant_var_def\00"
// * constant unnamed string with GPU binary
// CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",
// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",{{.*}}align 4096
// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
// CUDANORDC-SAME: section ".nv_fatbin", align 8
// CUDARDC-SAME: section "__nv_relfatbin", align 8
Expand Down
10 changes: 10 additions & 0 deletions clang/test/Driver/clang-offload-bundler.c
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,16 @@
// RUN: diff %t.empty %t.res.tgt1
// RUN: diff %t.empty %t.res.tgt2

//
// Check -bundle-align option
//

// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -inputs=%t.bc,%t.tgt1,%t.tgt2 -outputs=%t.bundle3.bc
// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -outputs=%t.res.bc,%t.res.tgt1,%t.res.tgt2 -inputs=%t.bundle3.bc -unbundle
// RUN: diff %t.bc %t.res.bc
// RUN: diff %t.tgt1 %t.res.tgt1
// RUN: diff %t.tgt2 %t.res.tgt2

// Some code so that we can create a binary out of this file.
int A = 0;
void test_func(void) {
Expand Down
2 changes: 2 additions & 0 deletions clang/test/Driver/hip-toolchain-no-rdc.hip
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
//

// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
// CHECK-SAME: "-bundle-align=4096"
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"

Expand Down Expand Up @@ -143,6 +144,7 @@
//

// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
// CHECK-SAME: "-bundle-align=4096"
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]"

Expand Down
5 changes: 5 additions & 0 deletions clang/test/Driver/hip-toolchain-rdc.hip
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \
// RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \
// RUN: -fuse-ld=lld -fgpu-rdc -nogpuinc \
// RUN: -fhip-dump-offload-linker-script \
// RUN: %S/Inputs/hip_multiple_inputs/a.cu \
// RUN: %S/Inputs/hip_multiple_inputs/b.hip \
// RUN: 2>&1 | FileCheck %s

// check code object alignment in dumped llvm-mc input
// CHECK: .p2align 12

// emit objects for host side path
// CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
// CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
Expand Down Expand Up @@ -87,6 +91,7 @@

// combine images generated into hip fat binary object
// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
// CHECK-SAME: "-bundle-align=4096"
// CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900"
// CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]"

Expand Down
13 changes: 13 additions & 0 deletions clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ static cl::opt<bool> PrintExternalCommands(
"instead of actually executing them - for testing purposes.\n"),
cl::init(false), cl::cat(ClangOffloadBundlerCategory));

static cl::opt<unsigned>
BundleAlignment("bundle-align",
cl::desc("Alignment of bundle for binary files"),
cl::init(1), cl::cat(ClangOffloadBundlerCategory));

/// Magic string that marks the existence of offloading data.
#define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__"

Expand Down Expand Up @@ -223,6 +228,9 @@ class BinaryFileHandler final : public FileHandler {
StringMap<BundleInfo>::iterator CurBundleInfo;
StringMap<BundleInfo>::iterator NextBundleInfo;

/// Current bundle target to be written.
std::string CurWriteBundleTarget;

public:
BinaryFileHandler() : FileHandler() {}

Expand Down Expand Up @@ -337,10 +345,12 @@ class BinaryFileHandler final : public FileHandler {
unsigned Idx = 0;
for (auto &T : TargetNames) {
MemoryBuffer &MB = *Inputs[Idx++];
HeaderSize = alignTo(HeaderSize, BundleAlignment);
// Bundle offset.
Write8byteIntegerToBuffer(OS, HeaderSize);
// Size of the bundle (adds to the next bundle's offset)
Write8byteIntegerToBuffer(OS, MB.getBufferSize());
BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize);
HeaderSize += MB.getBufferSize();
// Size of the triple
Write8byteIntegerToBuffer(OS, T.size());
Expand All @@ -351,6 +361,7 @@ class BinaryFileHandler final : public FileHandler {
}

Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final {
CurWriteBundleTarget = TargetTriple.str();
return Error::success();
}

Expand All @@ -359,6 +370,8 @@ class BinaryFileHandler final : public FileHandler {
}

Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final {
auto BI = BundlesInfo[CurWriteBundleTarget];
OS.seek(BI.Offset);
OS.write(Input.getBufferStart(), Input.getBufferSize());
return Error::success();
}
Expand Down

0 comments on commit dc6a0b0

Please sign in to comment.