Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Swap range metadata to attribute for workitem id. #94871

Merged
merged 1 commit into from
Jun 9, 2024

Conversation

andjo403
Copy link
Contributor

@andjo403 andjo403 commented Jun 8, 2024

Swap out range metadata to range attribute for calls to be able to deprecate range metadata on calls in the future.

@llvmbot
Copy link
Collaborator

llvmbot commented Jun 8, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Andreas Jonson (andjo403)

Changes

Swap out range metadata to range attribute for calls to be able to deprecate range metadata on calls in the future.


Full diff: https://github.com/llvm/llvm-project/pull/94871.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp (+10-4)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (+8-13)
  • (modified) llvm/test/CodeGen/AMDGPU/private-memory-r600.ll (+5-8)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll (+46-46)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 94ee4ac78142d..0751c8dc8b8bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -560,10 +560,16 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   else
     ++MaxSize;
 
-  MDBuilder MDB(I->getContext());
-  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
-                                                  APInt(32, MaxSize));
-  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  APInt Lower{32, MinSize};
+  APInt Upper{32, MaxSize};
+  if (auto *CI = dyn_cast<CallBase>(I)) {
+    ConstantRange Range(Lower, Upper);
+    CI->addRangeRetAttr(Range);
+  } else {
+    MDBuilder MDB(I->getContext());
+    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
+    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index d6841d40f2313..bd61558905f63 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -52,9 +52,9 @@
 ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, ptr addrspace(4) [[GEP1]], align 4, !range !2, !invariant.load !1
 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
 
-; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !3
-; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !3
-; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !3
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
 
 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
 ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
@@ -68,11 +68,11 @@
 ; HSAOPT: %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(3) [[LOCAL_GEP]], i32 0, i32 1
 
 
-; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !1
-; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !1
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
+; NOHSAOPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.y()
+; NOHSAOPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.z()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; NOHSAOPT: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
 define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
@@ -533,8 +533,3 @@ attributes #1 = { nounwind "amdgpu-flat-work-group-size"="1,256" }
 !99 = !{i32 1, !"amdhsa_code_object_version", i32 400}
 
 ; HSAOPT: !1 = !{}
-; HSAOPT: !2 = !{i32 0, i32 257}
-; HSAOPT: !3 = !{i32 0, i32 256}
-
-; NOHSAOPT: !1 = !{i32 0, i32 257}
-; NOHSAOPT: !2 = !{i32 0, i32 256}
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
index 462ab38b4cd58..1f7de7343efdb 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -12,11 +12,11 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; R600: LDS_READ
 ; R600: LDS_READ
 
-; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
-; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
-; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
-; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
+; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.y()
+; OPT: call range(i32 0, 257) i32 @llvm.r600.read.local.size.z()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.x()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.y()
+; OPT: call range(i32 0, 256) i32 @llvm.r600.read.tidig.z()
 
 define amdgpu_kernel void @mova_same_clause(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
@@ -276,7 +276,4 @@ define amdgpu_kernel void @ptrtoint(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
   ret void
 }
 
-; OPT: !0 = !{i32 0, i32 257}
-; OPT: !1 = !{i32 0, i32 256}
-
 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="1,256" }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
index ada1b841cd67c..778fe904382f2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
@@ -5,9 +5,9 @@
 
 ; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
 ; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
-; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
-; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK: call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
 define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index efc11bf1a606d..f56656270c335 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -8,22 +8,22 @@
 define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
-; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[B:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[B:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
 ; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
@@ -50,21 +50,21 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr add
 define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
 ; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
@@ -89,21 +89,21 @@ define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %o
 define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4, !range [[RNG1]], !invariant.load !0
-; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP4]], 16
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP10]]
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP15]]
-; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP16]], i32 0, i32 [[A:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT:    [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
 ; CHECK-NEXT:    store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. We also should move setting this into AMDGPUAttributor

@andjo403
Copy link
Contributor Author

andjo403 commented Jun 9, 2024

@arsenm thanks for the review I need help with merging as I do not have access.

@shiltian shiltian merged commit cc19374 into llvm:main Jun 9, 2024
9 checks passed
@andjo403 andjo403 deleted the amdGpuRangeAttr branch June 9, 2024 15:03
@HerrCai0907 HerrCai0907 mentioned this pull request Jun 13, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants