[flang] Allow passing an async id to allocate the descriptor #118713
Conversation
@llvm/pr-subscribers-flang-fir-hlfir @llvm/pr-subscribers-openacc

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes: This is a patch in preparation for supporting the stream-ordered memory allocator in CUDA Fortran. It adds an asynchronous id to the AllocatableAllocate runtime function and to Descriptor::Allocate so the id can be passed down to the registered allocator. It is up to the allocator whether to use this value (see the sketch after the diff below for what such an allocator might look like). A follow-up patch will implement the asynchronous allocator for CUDA Fortran.

Patch is 38.65 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/118713.diff

22 Files Affected:
diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h
index 06bda81c6f75ad..40423c5ce04885 100644
--- a/flang/include/flang/Runtime/CUDA/allocator.h
+++ b/flang/include/flang/Runtime/CUDA/allocator.h
@@ -19,16 +19,16 @@ extern "C" {
void RTDECL(CUFRegisterAllocator)();
}
-void *CUFAllocPinned(std::size_t);
+void *CUFAllocPinned(std::size_t, std::int64_t);
void CUFFreePinned(void *);
-void *CUFAllocDevice(std::size_t);
+void *CUFAllocDevice(std::size_t, std::int64_t);
void CUFFreeDevice(void *);
-void *CUFAllocManaged(std::size_t);
+void *CUFAllocManaged(std::size_t, std::int64_t);
void CUFFreeManaged(void *);
-void *CUFAllocUnified(std::size_t);
+void *CUFAllocUnified(std::size_t, std::int64_t);
void CUFFreeUnified(void *);
} // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Runtime/CUDA/common.h b/flang/include/flang/Runtime/CUDA/common.h
index e9f61932230e95..8172ea39a14f84 100644
--- a/flang/include/flang/Runtime/CUDA/common.h
+++ b/flang/include/flang/Runtime/CUDA/common.h
@@ -23,6 +23,9 @@ static constexpr unsigned kHostToDevice = 0;
static constexpr unsigned kDeviceToHost = 1;
static constexpr unsigned kDeviceToDevice = 2;
+/// Value used for asyncId when no specific stream is specified.
+static constexpr std::int64_t kCudaNoStream = -1;
+
#define CUDA_REPORT_IF_ERROR(expr) \
[](cudaError_t err) { \
if (err == cudaSuccess) \
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 58061d9862095e..121c31af963aa0 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -94,9 +94,9 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
// Successfully allocated memory is initialized if the allocatable has a
// derived type, and is always initialized by AllocatableAllocateSource().
// Performs all necessary coarray synchronization and validation actions.
-int RTDECL(AllocatableAllocate)(Descriptor &, bool hasStat = false,
- const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
- int sourceLine = 0);
+int RTDECL(AllocatableAllocate)(Descriptor &, std::int64_t asyncId = -1,
+ bool hasStat = false, const Descriptor *errMsg = nullptr,
+ const char *sourceFile = nullptr, int sourceLine = 0);
int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
bool hasStat = false, const Descriptor *errMsg = nullptr,
const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/allocator-registry.h b/flang/include/flang/Runtime/allocator-registry.h
index 3ccee56dc3fc0f..771fa8a9a9933c 100644
--- a/flang/include/flang/Runtime/allocator-registry.h
+++ b/flang/include/flang/Runtime/allocator-registry.h
@@ -10,6 +10,7 @@
#define FORTRAN_RUNTIME_ALLOCATOR_REGISTRY_H_
#include "flang/Common/api-attrs.h"
+#include <cstdint>
#include <cstdlib>
#include <vector>
@@ -25,7 +26,7 @@ static constexpr unsigned kUnifiedAllocatorPos = 4;
namespace Fortran::runtime {
-using AllocFct = void *(*)(std::size_t);
+using AllocFct = void *(*)(std::size_t, std::int64_t);
using FreeFct = void (*)(void *);
typedef struct Allocator_t {
@@ -33,10 +34,11 @@ typedef struct Allocator_t {
FreeFct free{nullptr};
} Allocator_t;
-#ifdef RT_DEVICE_COMPILATION
-static RT_API_ATTRS void *MallocWrapper(std::size_t size) {
+static RT_API_ATTRS void *MallocWrapper(
+ std::size_t size, [[maybe_unused]] std::int64_t) {
return std::malloc(size);
}
+#ifdef RT_DEVICE_COMPILATION
static RT_API_ATTRS void FreeWrapper(void *p) { return std::free(p); }
#endif
@@ -46,7 +48,7 @@ struct AllocatorRegistry {
: allocators{{&MallocWrapper, &FreeWrapper}} {}
#else
constexpr AllocatorRegistry() {
- allocators[kDefaultAllocator] = {&std::malloc, &std::free};
+ allocators[kDefaultAllocator] = {&MallocWrapper, &std::free};
};
#endif
RT_API_ATTRS void Register(int, Allocator_t);
diff --git a/flang/include/flang/Runtime/descriptor.h b/flang/include/flang/Runtime/descriptor.h
index 030d0c1031fbaa..e6300accfeac08 100644
--- a/flang/include/flang/Runtime/descriptor.h
+++ b/flang/include/flang/Runtime/descriptor.h
@@ -374,7 +374,7 @@ class Descriptor {
// before calling. It (re)computes the byte strides after
// allocation. Does not allocate automatic components or
// perform default component initialization.
- RT_API_ATTRS int Allocate();
+ RT_API_ATTRS int Allocate(std::int64_t asyncId = -1);
RT_API_ATTRS void SetByteStrides();
// Deallocates storage; does not call FINAL subroutines or
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index fb8380ac7e8c51..f1436564aabaa2 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -184,9 +184,14 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder,
? fir::runtime::getRuntimeFunc<mkRTKey(PointerAllocate)>(loc, builder)
: fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc,
builder);
- llvm::SmallVector<mlir::Value> args{
- box.getAddr(), errorManager.hasStat, errorManager.errMsgAddr,
- errorManager.sourceFile, errorManager.sourceLine};
+ llvm::SmallVector<mlir::Value> args{box.getAddr()};
+ if (!box.isPointer())
+ args.push_back(
+ builder.createIntegerConstant(loc, builder.getI64Type(), -1));
+ args.push_back(errorManager.hasStat);
+ args.push_back(errorManager.errMsgAddr);
+ args.push_back(errorManager.sourceFile);
+ args.push_back(errorManager.sourceLine);
llvm::SmallVector<mlir::Value> operands;
for (auto [fst, snd] : llvm::zip(args, callee.getFunctionType().getInputs()))
operands.emplace_back(builder.createConvert(loc, snd, fst));
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index 70a88ff18cb1da..28452d3b486da3 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -76,16 +76,19 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
mlir::func::FuncOp func{
fir::runtime::getRuntimeFunc<mkRTKey(AllocatableAllocate)>(loc, builder)};
mlir::FunctionType fTy{func.getFunctionType()};
+ mlir::Value asyncId =
+ builder.createIntegerConstant(loc, builder.getI64Type(), -1);
mlir::Value sourceFile{fir::factory::locationToFilename(builder, loc)};
mlir::Value sourceLine{
- fir::factory::locationToLineNo(builder, loc, fTy.getInput(4))};
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(5))};
if (!hasStat)
hasStat = builder.createBool(loc, false);
if (!errMsg) {
mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
errMsg = builder.create<fir::AbsentOp>(loc, boxNoneTy).getResult();
}
- llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
- builder, loc, fTy, desc, hasStat, errMsg, sourceFile, sourceLine)};
+ llvm::SmallVector<mlir::Value> args{
+ fir::runtime::createArguments(builder, loc, fTy, desc, asyncId, hasStat,
+ errMsg, sourceFile, sourceLine)};
builder.create<fir::CallOp>(loc, func, args);
}
diff --git a/flang/runtime/CUDA/allocatable.cpp b/flang/runtime/CUDA/allocatable.cpp
index 9be54e8906903d..3f6f8f3d6d5de0 100644
--- a/flang/runtime/CUDA/allocatable.cpp
+++ b/flang/runtime/CUDA/allocatable.cpp
@@ -52,7 +52,7 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t stream,
}
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
- desc, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
return stat;
}
diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp
index 85b3daf65a8ba4..e41ed77e40ff99 100644
--- a/flang/runtime/CUDA/allocator.cpp
+++ b/flang/runtime/CUDA/allocator.cpp
@@ -33,7 +33,8 @@ void RTDEF(CUFRegisterAllocator)() {
}
}
-void *CUFAllocPinned(std::size_t sizeInBytes) {
+void *CUFAllocPinned(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
return p;
@@ -41,7 +42,8 @@ void *CUFAllocPinned(std::size_t sizeInBytes) {
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
-void *CUFAllocDevice(std::size_t sizeInBytes) {
+void *CUFAllocDevice(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
return p;
@@ -49,7 +51,8 @@ void *CUFAllocDevice(std::size_t sizeInBytes) {
void CUFFreeDevice(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocManaged(std::size_t sizeInBytes) {
+void *CUFAllocManaged(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
void *p;
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
@@ -58,9 +61,10 @@ void *CUFAllocManaged(std::size_t sizeInBytes) {
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
-void *CUFAllocUnified(std::size_t sizeInBytes) {
+void *CUFAllocUnified(
+ std::size_t sizeInBytes, [[maybe_unused]] std::int64_t asyncId) {
// Call alloc managed for the time being.
- return CUFAllocManaged(sizeInBytes);
+ return CUFAllocManaged(sizeInBytes, asyncId);
}
void CUFFreeUnified(void *p) {
diff --git a/flang/runtime/CUDA/descriptor.cpp b/flang/runtime/CUDA/descriptor.cpp
index 7ce1429cd94d4a..f1feb00941aa8a 100644
--- a/flang/runtime/CUDA/descriptor.cpp
+++ b/flang/runtime/CUDA/descriptor.cpp
@@ -19,7 +19,8 @@ RT_EXT_API_GROUP_BEGIN
Descriptor *RTDEF(CUFAllocDesciptor)(
std::size_t sizeInBytes, const char *sourceFile, int sourceLine) {
- return reinterpret_cast<Descriptor *>(CUFAllocManaged(sizeInBytes));
+ return reinterpret_cast<Descriptor *>(
+ CUFAllocManaged(sizeInBytes, kCudaNoStream));
}
void RTDEF(CUFFreeDesciptor)(
diff --git a/flang/runtime/allocatable.cpp b/flang/runtime/allocatable.cpp
index 5e065f47636a89..b65cec8d51cf86 100644
--- a/flang/runtime/allocatable.cpp
+++ b/flang/runtime/allocatable.cpp
@@ -133,15 +133,17 @@ void RTDEF(AllocatableApplyMold)(
}
}
-int RTDEF(AllocatableAllocate)(Descriptor &descriptor, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+int RTDEF(AllocatableAllocate)(Descriptor &descriptor, std::int64_t asyncId,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
Terminator terminator{sourceFile, sourceLine};
if (!descriptor.IsAllocatable()) {
return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
} else if (descriptor.IsAllocated()) {
return ReturnError(terminator, StatBaseNotNull, errMsg, hasStat);
} else {
- int stat{ReturnError(terminator, descriptor.Allocate(), errMsg, hasStat)};
+ int stat{
+ ReturnError(terminator, descriptor.Allocate(asyncId), errMsg, hasStat)};
if (stat == StatOk) {
if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
if (const auto *derived{addendum->derivedType()}) {
@@ -160,7 +162,7 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(AllocatableAllocate)(
- alloc, hasStat, errMsg, sourceFile, sourceLine)};
+ alloc, /*asyncId=*/-1, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
DoFromSourceAssign(alloc, source, terminator);
diff --git a/flang/runtime/array-constructor.cpp b/flang/runtime/array-constructor.cpp
index 72e08feff7fd10..3d0e969188f259 100644
--- a/flang/runtime/array-constructor.cpp
+++ b/flang/runtime/array-constructor.cpp
@@ -50,8 +50,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
initialAllocationSize(fromElements, to.ElementBytes())};
to.GetDimension(0).SetBounds(1, allocationSize);
RTNAME(AllocatableAllocate)
- (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
- vector.sourceLine);
+ (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+ vector.sourceFile, vector.sourceLine);
to.GetDimension(0).SetBounds(1, fromElements);
vector.actualAllocationSize = allocationSize;
} else {
@@ -59,8 +59,8 @@ static RT_API_ATTRS void AllocateOrReallocateVectorIfNeeded(
// first value: there should be no reallocation.
RUNTIME_CHECK(terminator, previousToElements >= fromElements);
RTNAME(AllocatableAllocate)
- (to, /*hasStat=*/false, /*errMsg=*/nullptr, vector.sourceFile,
- vector.sourceLine);
+ (to, /*asyncId=*/-1, /*hasStat=*/false, /*errMsg=*/nullptr,
+ vector.sourceFile, vector.sourceLine);
vector.actualAllocationSize = previousToElements;
}
} else {
diff --git a/flang/runtime/descriptor.cpp b/flang/runtime/descriptor.cpp
index 32f43e89dc7a36..f43c96bed7d00d 100644
--- a/flang/runtime/descriptor.cpp
+++ b/flang/runtime/descriptor.cpp
@@ -163,7 +163,7 @@ RT_API_ATTRS static inline int MapAllocIdx(const Descriptor &desc) {
#endif
}
-RT_API_ATTRS int Descriptor::Allocate() {
+RT_API_ATTRS int Descriptor::Allocate(std::int64_t asyncId) {
std::size_t elementBytes{ElementBytes()};
if (static_cast<std::int64_t>(elementBytes) < 0) {
// F'2023 7.4.4.2 p5: "If the character length parameter value evaluates
@@ -175,7 +175,7 @@ RT_API_ATTRS int Descriptor::Allocate() {
// Zero size allocation is possible in Fortran and the resulting
// descriptor must be allocated/associated. Since std::malloc(0)
// result is implementation defined, always allocate at least one byte.
- void *p{alloc(byteSize ? byteSize : 1)};
+ void *p{alloc(byteSize ? byteSize : 1, asyncId)};
if (!p) {
return CFI_ERROR_MEM_ALLOCATION;
}
diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir
index 0d5f343cb17711..3c33bf8fca2d14 100644
--- a/flang/test/HLFIR/elemental-codegen.fir
+++ b/flang/test/HLFIR/elemental-codegen.fir
@@ -192,7 +192,7 @@ func.func @test_polymorphic(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.bindc_
// CHECK: %[[VAL_35:.*]] = fir.absent !fir.box<none>
// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %{{.*}}, %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
// CHECK: %[[VAL_40:.*]] = arith.constant 1 : index
// CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_40]] to %[[EX1]] step %[[VAL_40]] unordered {
@@ -276,7 +276,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
// CHECK: %[[VAL_36:.*]] = fir.absent !fir.box<none>
// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %{{.*}}, %[[VAL_35]], %[[VAL_36]], %[[VAL_38]], %[[VAL_34]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
// CHECK: %[[VAL_41:.*]] = arith.constant 1 : index
// CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_41]] to %[[VAL_3]] step %[[VAL_41]] unordered {
@@ -329,7 +329,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class<!fir.type<_QMtypesTt>> {fir.b
// CHECK: %[[VAL_85:.*]] = fir.absent !fir.box<none>
// CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %{{.*}}, %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMtypesTt>>>>>
// CHECK: %[[VAL_90:.*]] = arith.constant 1 : index
// CHECK: fir.do_loop %[[VAL_91:.*]] = %[[VAL_90]] to %[[VAL_3]] step %[[VAL_90]] unordered {
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 0066e712fbdcce..9fe51a8db55e3b 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -469,6 +469,6 @@ subroutine init()
end module
! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_update_desc_post_alloc>} : (!fir.ref<!fir.box<none>>, i64, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index 4d70e1ea4c739a..852ce5159c18ce 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -267,7 +267,7 @@ subroutine test_allocatable()
! CHECK: %[[C0:.*]] = arith.constant 0 : i32
! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#1 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) ...
[truncated]
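To make the new parameter concrete, here is a minimal sketch of how a stream-ordered device allocator could consume `asyncId`. This is not part of the patch: the follow-up CUDA Fortran allocator is only announced above, so `CUFAllocDeviceAsync` is a hypothetical name, and interpreting `asyncId` as a `cudaStream_t` handle is an assumption of this sketch.

```cpp
#include <cstddef>
#include <cstdint>

#include "cuda_runtime.h"
#include "flang/Runtime/CUDA/common.h" // CUDA_REPORT_IF_ERROR, kCudaNoStream

// Hypothetical stream-ordered variant of CUFAllocDevice (not in this patch).
// Assumes asyncId carries a cudaStream_t handle; kCudaNoStream (-1) selects
// the synchronous path, matching the default that the lowering code passes.
void *CUFAllocDeviceAsync(std::size_t sizeInBytes, std::int64_t asyncId) {
  void *p;
  if (asyncId == kCudaNoStream) {
    CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
  } else {
    // cudaMallocAsync (CUDA 11.2+) orders the allocation on the given stream.
    CUDA_REPORT_IF_ERROR(cudaMallocAsync(
        &p, sizeInBytes, reinterpret_cast<cudaStream_t>(asyncId)));
  }
  return p;
}
```

Since the registered `AllocFct` now uniformly takes the id, allocators that have no use for it, such as `MallocWrapper` above, simply ignore the second argument.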
…lvm#118713)" This reverts commit 7d1c661.
…lvm#118713)" (llvm#119109) This reverts commit 7d1c661. This commit breaks some device runtime builds. Need time to investigate.
…8713) This is a patch in preparation for supporting the stream-ordered memory allocator in CUDA Fortran. This patch adds an asynchronous id to the AllocatableAllocate runtime function and to Descriptor::Allocate so it can be passed down to the registered allocator. It is up to the allocator whether to use this value. A follow-up patch will implement the asynchronous allocator for CUDA Fortran.
This is a patch in preparation for supporting the stream-ordered memory allocator in CUDA Fortran.
This patch adds an asynchronous id to the AllocatableAllocate runtime function and to Descriptor::Allocate so it can be passed down to the registered allocator. It is up to the allocator whether to use this value.
A follow-up patch will implement the asynchronous allocator for CUDA Fortran.
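For completeness, a minimal sketch of plugging a custom allocator into the updated registry under the new two-argument `AllocFct` signature. Only `AllocFct`, `Allocator_t`, and `Register` appear in the diff above; the global `allocatorRegistry` instance name and the slot index used here are assumptions for illustration.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#include "flang/Runtime/allocator-registry.h"

namespace {
// Toy allocator matching the new AllocFct signature. Like MallocWrapper in
// the patch, it may ignore asyncId; here it logs the value to show that the
// id really reaches the registered allocator.
void *LoggingAlloc(std::size_t bytes, std::int64_t asyncId) {
  std::fprintf(stderr, "alloc %zu bytes, asyncId=%lld\n", bytes,
               static_cast<long long>(asyncId));
  return std::malloc(bytes);
}
void LoggingFree(void *p) { std::free(p); }
} // namespace

// Sketch only: assumes the registry exposes a global instance named
// allocatorRegistry and that slot 1 is the one to override; neither is
// shown in the diff above.
void RegisterLoggingAllocator() {
  Fortran::runtime::allocatorRegistry.Register(
      /*pos=*/1, {&LoggingAlloc, &LoggingFree});
}
```

With this in place, any `AllocatableAllocate` call whose descriptor maps to that slot receives the `asyncId` that lowering passed in, `-1` by default.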