-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131
Conversation
@llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-nvptx Author: Joseph Huber (jhuber6) ChangesSummary: Fixes #100055 Full diff: https://github.com/llvm/llvm-project/pull/100131.diff 3 Files Affected:
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec55890..fa878d8999227 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
- set(extra_entrypoints
- # stdio.h entrypoints
- libc.src.stdio.snprintf
- libc.src.stdio.sprintf
- libc.src.stdio.vsnprintf
- libc.src.stdio.vsprintf
- )
-endif()
-
set(TARGET_LIBC_ENTRYPOINTS
# assert.h entrypoints
libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.errno.errno
# stdio.h entrypoints
- ${extra_entrypoints}
libc.src.stdio.clearerr
libc.src.stdio.fclose
libc.src.stdio.printf
libc.src.stdio.vprintf
libc.src.stdio.fprintf
libc.src.stdio.vfprintf
+ libc.src.stdio.snprintf
+ libc.src.stdio.sprintf
+ libc.src.stdio.vsnprintf
+ libc.src.stdio.vsprintf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fflush
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44c1a2e50486c..bcd2ee2f4fc1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
bool first = true;
- const Function *F = CB.getFunction();
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
Type *Ty = Args[i].Ty;
@@ -1471,10 +1470,11 @@ std::string NVPTXTargetLowering::getPrototype(
continue;
}
+ // Indirect calls need strict ABI alignment so we disable optimizations.
Type *ETy = Args[i].IndirectType;
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
Align ParamByValAlign =
- getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+ getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
O << ".param .align " << ParamByValAlign.value() << " .b8 ";
O << "_";
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 0000000000000..ac6c4e262fd60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b16 %rs<2>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT: add.u64 %rd2, %SP, 0;
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 1 .b8 param0[1];
+; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_0;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %s = alloca %struct.S, align 1
+ %agg.tmp = alloca %struct.S, align 1
+ %0 = load ptr, ptr @ptr, align 8
+ %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+ ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK: // @bar
+; CHECK-NEXT: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT: add.u64 %rd3, %SP, 0;
+; CHECK-NEXT: { // callseq 1, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[8];
+; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT: .param .b64 param1;
+; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT: call (retval0),
+; CHECK-NEXT: %rd1,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0,
+; CHECK-NEXT: param1
+; CHECK-NEXT: )
+; CHECK-NEXT: , prototype_1;
+; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT: } // callseq 1
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
+entry:
+ %s = alloca %struct.U, align 8
+ %agg.tmp = alloca %struct.U, align 8
+ %0 = load ptr, ptr @ptr, align 8
+ %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+ ret i32 %call
+}
|
The |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM in principle, with a nit.
Type *ETy = Args[i].IndirectType; | ||
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); | ||
Align ParamByValAlign = | ||
getFunctionByValParamAlign(F, ETy, InitialAlign, DL); | ||
getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment above needs to mantion that it's the nullptr
here that forces strict ABI-compliant alignment. Otherwise it's not clear which part of the code controls this.
Ideally, it would be even nicer to make "ABI-compliant alignment" a parameter.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't really make sense to me as a fix. Indirect calls shouldn't require special handling vs. direct
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The NVPTX backend thinks that internal
functions can have their ABI modified, so they up-scale alignment on things. This doesn't work for these indirect calls, which seem to strictly require the ABI.
Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055
cbfdca3
to
1a779ff
Compare
…vm#100131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055 (cherry picked from commit e0649a5)
…vm#100131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055 (cherry picked from commit e0649a5)
…00131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes #100055 Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251110
Summary:
The NVPTX backend optimizes the ABI for functions that are internal,
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI on an aggregate byval type passed to an indirect
call prototype, which would make PTXAS error. This patch just passes the
function as a nullptr to force strict ABI compliance without
modification in the helper function.
Fixes #100055