[clang][OpenCL][CodeGen][AMDGPU] Do not use `private` as the default AS when `generic` is available (#112442)

Currently, for AMDGPU, when compiling for OpenCL, we unconditionally use `private` as the default address space (AS). This is wrong for cases where the `generic` address space is available, and is corrected via this patch. In general, this abuse of the AS map is a bad hack and we should rework it altogether, but at least after this patch we will stop being incorrect for e.g. OpenCL 2.0.
llvm · Oct 22, 2024 · 6e0b003 · 6e0b003
1 parent aea60ab
commit 6e0b003
Show file tree

Hide file tree

Showing 20 changed files with 1,154 additions and 549 deletions.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -260,9 +260,9 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
 void AMDGPUTargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
   TargetInfo::adjust(Diags, Opts);
   // ToDo: There are still a few places using default address space as private
-  // address space in OpenCL, which needs to be cleaned up, then Opts.OpenCL
-  // can be removed from the following line.
-  setAddressSpaceMap(/*DefaultIsPrivate=*/Opts.OpenCL ||
+  // address space in OpenCL, which needs to be cleaned up, then the references
+  // to OpenCL can be removed from the following line.
+  setAddressSpaceMap((Opts.OpenCL && !Opts.OpenCLGenericAddressSpace) ||
                      !isAMDGCN(getTriple()));
 }
 

diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
@@ -1397,7 +1397,8 @@ void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D,
       DI->setLocation(D->getLocation());
       DI->EmitDeclareOfBlockLiteralArgVariable(
           *BlockInfo, D->getName(), argNum,
-          cast<llvm::AllocaInst>(alloc.getPointer()), Builder);
+          cast<llvm::AllocaInst>(alloc.getPointer()->stripPointerCasts()),
+          Builder);
     }
   }
 

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5853,8 +5853,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
           /*IndexTypeQuals=*/0);
       auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
       llvm::Value *TmpPtr = Tmp.getPointer();
+      // The EmitLifetime* pair expect a naked Alloca as their last argument,
+      // however for cases where the default AS is not the Alloca AS, Tmp is
+      // actually the Alloca ascasted to the default AS, hence the
+      // stripPointerCasts()
+      llvm::Value *Alloca = TmpPtr->stripPointerCasts();
       llvm::Value *TmpSize = EmitLifetimeStart(
-          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
+          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), Alloca);
       llvm::Value *ElemPtr;
       // Each of the following arguments specifies the size of the corresponding
       // argument passed to the enqueued block.
@@ -5870,7 +5875,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         Builder.CreateAlignedStore(
             V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
       }
-      return std::tie(ElemPtr, TmpSize, TmpPtr);
+      // Return the Alloca itself rather than a potential ascast as this is only
+      // used by the paired EmitLifetimeEnd.
+      return std::tie(ElemPtr, TmpSize, Alloca);
     };
 
     // Could have events and/or varargs.

diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl
@@ -15,8 +15,9 @@
 // CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CL20-NEXT:  [[ENTRY:.*:]]
 // CL20-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CL20-NEXT:    store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8
-// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8
+// CL20-NEXT:    [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr
+// CL20-NEXT:    store ptr [[X]], ptr [[X_ADDR_ASCAST]], align 8
+// CL20-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR_ASCAST]], align 8
 // CL20-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CL20-NEXT:    ret void
 //
@@ -54,25 +55,27 @@ void func1(int *x) {
 // CL20-NEXT:    [[LP1:%.*]] = alloca ptr, align 8, addrspace(5)
 // CL20-NEXT:    [[LP2:%.*]] = alloca ptr, align 8, addrspace(5)
 // CL20-NEXT:    [[LVC:%.*]] = alloca i32, align 4, addrspace(5)
-// CL20-NEXT:    store i32 1, ptr addrspace(5) [[LV1]], align 4
-// CL20-NEXT:    store i32 2, ptr addrspace(5) [[LV2]], align 4
-// CL20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
-// CL20-NEXT:    store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4
 // CL20-NEXT:    [[LV1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
-// CL20-NEXT:    store ptr [[LV1_ASCAST]], ptr addrspace(5) [[LP1]], align 8
-// CL20-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0
-// CL20-NEXT:    [[ARRAYDECAY_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr
-// CL20-NEXT:    store ptr [[ARRAYDECAY_ASCAST]], ptr addrspace(5) [[LP2]], align 8
-// CL20-NEXT:    [[LV1_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr
-// CL20-NEXT:    call void @func1(ptr noundef [[LV1_ASCAST1]]) #[[ATTR2:[0-9]+]]
-// CL20-NEXT:    store i32 4, ptr addrspace(5) [[LVC]], align 4
-// CL20-NEXT:    store i32 4, ptr addrspace(5) [[LV1]], align 4
+// CL20-NEXT:    [[LV2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV2]] to ptr
+// CL20-NEXT:    [[LA_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LA]] to ptr
+// CL20-NEXT:    [[LP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP1]] to ptr
+// CL20-NEXT:    [[LP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP2]] to ptr
+// CL20-NEXT:    [[LVC_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LVC]] to ptr
+// CL20-NEXT:    store i32 1, ptr [[LV1_ASCAST]], align 4
+// CL20-NEXT:    store i32 2, ptr [[LV2_ASCAST]], align 4
+// CL20-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[LA_ASCAST]], i64 0, i64 0
+// CL20-NEXT:    store i32 3, ptr [[ARRAYIDX]], align 4
+// CL20-NEXT:    store ptr [[LV1_ASCAST]], ptr [[LP1_ASCAST]], align 8
+// CL20-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr [[LA_ASCAST]], i64 0, i64 0
+// CL20-NEXT:    store ptr [[ARRAYDECAY]], ptr [[LP2_ASCAST]], align 8
+// CL20-NEXT:    call void @func1(ptr noundef [[LV1_ASCAST]]) #[[ATTR2:[0-9]+]]
+// CL20-NEXT:    store i32 4, ptr [[LVC_ASCAST]], align 4
+// CL20-NEXT:    store i32 4, ptr [[LV1_ASCAST]], align 4
 // CL20-NEXT:    ret void
 //
 void func2(void) {
   int lv1;
   lv1 = 1;
-
   int lv2 = 2;
 
   int la[100];
@@ -99,7 +102,8 @@ void func2(void) {
 // CL20-SAME: ) #[[ATTR0]] {
 // CL20-NEXT:  [[ENTRY:.*:]]
 // CL20-NEXT:    [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5)
-// CL20-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false)
+// CL20-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
+// CL20-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[A_ASCAST]], i8 0, i64 64, i1 false)
 // CL20-NEXT:    ret void
 //
 void func3(void) {