diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index af3451023e3df..e974f3d6eef11 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -43,6 +43,7 @@ struct ProcedureDesignator; namespace parser { struct AccClauseList; +struct DoConstruct; struct OpenACCConstruct; struct OpenACCDeclarativeConstruct; struct OpenACCRoutineConstruct; @@ -58,6 +59,7 @@ namespace lower { class AbstractConverter; class StatementContext; +class SymMap; namespace pft { struct Evaluation; @@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &, void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *, mlir::Location); -int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &); +/// Used to obtain the number of contained loops to look for +/// since this is dependent on the number of tile operands and the collapse +/// clause. +uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &); +/// Checks whether the current insertion point is inside an OpenACC loop. bool isInOpenACCLoop(fir::FirOpBuilder &); +/// Checks whether the current insertion point is inside an OpenACC compute +/// construct. +bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &); + void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &); void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location); +/// Generates an OpenACC loop from a do construct in order to +/// properly capture the loop bounds, parallelism determination mode, +/// and to privatize the loop variables. +/// When the conversion is rejected, nullptr is returned. 
+mlir::Operation *genOpenACCLoopFromDoConstruct( + AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::SymMap &localSymbols, + const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval); + } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 5f0783f869bf6..5eb1bafbb7ea2 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2164,10 +2164,35 @@ class FirConverter : public Fortran::lower::AbstractConverter { /// - structured and unstructured concurrent loops void genFIR(const Fortran::parser::DoConstruct &doConstruct) { setCurrentPositionAt(doConstruct); - // Collect loop nest information. - // Generate begin loop code directly for infinite and while loops. Fortran::lower::pft::Evaluation &eval = getEval(); bool unstructuredContext = eval.lowerAsUnstructured(); + + // Loops with induction variables inside OpenACC compute constructs + // need special handling to ensure that the IVs are privatized. + if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) { + mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct( + *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval); + bool success = loopOp != nullptr; + if (success) { + // Sanity check that the builder insertion point is inside the newly + // generated loop. + assert( + loopOp->getRegion(0).isAncestor( + builder->getInsertionPoint()->getBlock()->getParent()) && + "builder insertion point is not inside the newly generated loop"); + + // Loop body code: lower every nested evaluation except the last one. + auto iter = eval.getNestedEvaluations().begin(); + for (auto end = --eval.getNestedEvaluations().end(); iter != end; + ++iter) + genFIR(*iter, unstructuredContext); + return; + } + // Fall back to normal loop handling. + } + + // Collect loop nest information. + // Generate begin loop code directly for infinite and while loops. 
Fortran::lower::pft::Evaluation &doStmtEval = eval.getFirstNestedEvaluation(); auto *doStmt = doStmtEval.getIf(); @@ -3121,7 +3146,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { Fortran::lower::pft::Evaluation *curEval = &getEval(); if (accLoop || accCombined) { - int64_t loopCount; + uint64_t loopCount; if (accLoop) { const Fortran::parser::AccBeginLoopDirective &beginLoopDir = std::get(accLoop->t); @@ -3139,7 +3164,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (curEval->lowerAsStructured()) { curEval = &curEval->getFirstNestedEvaluation(); - for (int64_t i = 1; i < loopCount; i++) + for (uint64_t i = 1; i < loopCount; i++) curEval = &*std::next(curEval->getNestedEvaluations().begin()); } } diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 51eb33dec186b..5f58ce02c11b1 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -36,6 +36,7 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LLVM.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Frontend/OpenACC/ACC.h.inc" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2138,6 +2139,168 @@ static void determineDefaultLoopParMode( } } +// Extract loop bounds, steps, induction variables, and privatization info +// for both DO CONCURRENT and regular do loops +static void processDoLoopBounds( + Fortran::lower::AbstractConverter &converter, + mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, + fir::FirOpBuilder &builder, + const Fortran::parser::DoConstruct &outerDoConstruct, + Fortran::lower::pft::Evaluation &eval, + llvm::SmallVector &lowerbounds, + llvm::SmallVector &upperbounds, + llvm::SmallVector &steps, + llvm::SmallVector &privateOperands, + llvm::SmallVector &ivPrivate, + llvm::SmallVector &privatizationRecipes, + llvm::SmallVector &ivTypes, + llvm::SmallVector &ivLocs, + llvm::SmallVector &inclusiveBounds, + llvm::SmallVector 
&locs, uint64_t loopsToProcess) { + assert(loopsToProcess > 0 && "expect at least one loop"); + locs.push_back(currentLocation); // Location of the directive + Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); + bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); + + if (isDoConcurrent) { + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + const Fortran::parser::LoopControl *loopControl = + &*outerDoConstruct.GetLoopControl(); + const auto &concurrent = + std::get(loopControl->u); + if (!std::get>(concurrent.t) + .empty()) + TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC"); + + const auto &concurrentHeader = + std::get(concurrent.t); + const auto &controls = + std::get>( + concurrentHeader.t); + for (const auto &control : controls) { + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx))); + if (const auto &expr = + std::get>( + control.t)) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*expr), stmtCtx))); + else // If `step` is not present, assume it is `1`. 
+ steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + const auto &name = std::get(control.t); + privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizationRecipes, + isDoConcurrent); + + inclusiveBounds.push_back(true); + } + } else { + for (uint64_t i = 0; i < loopsToProcess; ++i) { + const Fortran::parser::LoopControl *loopControl; + if (i == 0) { + loopControl = &*outerDoConstruct.GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + } else { + auto *doCons = crtEval->getIf(); + assert(doCons && "expect do construct"); + loopControl = &*doCons->GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(*doCons))); + } + + const Fortran::parser::LoopControl::Bounds *bounds = + std::get_if(&loopControl->u); + assert(bounds && "Expected bounds on the loop construct"); + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); + if (bounds->step) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); + else // If `step` is not present, assume it is `1`. 
+ steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + Fortran::semantics::Symbol &ivSym = + bounds->name.thing.symbol->GetUltimate(); + privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizationRecipes); + + inclusiveBounds.push_back(true); + + if (i < loopsToProcess - 1) + crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + } + } +} + +static mlir::acc::LoopOp +buildACCLoopOp(Fortran::lower::AbstractConverter &converter, + mlir::Location currentLocation, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::DoConstruct &outerDoConstruct, + Fortran::lower::pft::Evaluation &eval, + llvm::SmallVector &privateOperands, + llvm::SmallVector &privatizationRecipes, + llvm::SmallVector &gangOperands, + llvm::SmallVector &workerNumOperands, + llvm::SmallVector &vectorOperands, + llvm::SmallVector &tileOperands, + llvm::SmallVector &cacheOperands, + llvm::SmallVector &reductionOperands, + llvm::SmallVector &retTy, mlir::Value yieldValue, + uint64_t loopsToProcess) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + llvm::SmallVector ivPrivate; + llvm::SmallVector ivTypes; + llvm::SmallVector ivLocs; + llvm::SmallVector inclusiveBounds; + llvm::SmallVector locs; + llvm::SmallVector lowerbounds, upperbounds, steps; + + // Look at the do/do concurrent loops to extract bounds information. + processDoLoopBounds(converter, currentLocation, stmtCtx, builder, + outerDoConstruct, eval, lowerbounds, upperbounds, steps, + privateOperands, ivPrivate, privatizationRecipes, ivTypes, + ivLocs, inclusiveBounds, locs, loopsToProcess); + + // Prepare the operand segment size attribute and the operands value range. 
+ llvm::SmallVector operands; + llvm::SmallVector operandSegments; + addOperands(operands, operandSegments, lowerbounds); + addOperands(operands, operandSegments, upperbounds); + addOperands(operands, operandSegments, steps); + addOperands(operands, operandSegments, gangOperands); + addOperands(operands, operandSegments, workerNumOperands); + addOperands(operands, operandSegments, vectorOperands); + addOperands(operands, operandSegments, tileOperands); + addOperands(operands, operandSegments, cacheOperands); + addOperands(operands, operandSegments, privateOperands); + addOperands(operands, operandSegments, reductionOperands); + + auto loopOp = createRegionOp( + builder, builder.getFusedLoc(locs), currentLocation, eval, operands, + operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes, + ivLocs); + + for (auto [arg, value] : llvm::zip( + loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) + builder.create(currentLocation, arg, value); + + loopOp.setInclusiveUpperbound(inclusiveBounds); + + return loopOp; +} + static mlir::acc::LoopOp createLoopOp( Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -2150,9 +2313,9 @@ static mlir::acc::LoopOp createLoopOp( std::nullopt, bool needEarlyReturnHandling = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - llvm::SmallVector tileOperands, privateOperands, ivPrivate, + llvm::SmallVector tileOperands, privateOperands, reductionOperands, cacheOperands, vectorOperands, workerNumOperands, - gangOperands, lowerbounds, upperbounds, steps; + gangOperands; llvm::SmallVector privatizationRecipes, reductionRecipes; llvm::SmallVector tileOperandsSegments, gangOperandsSegments; llvm::SmallVector collapseValues; @@ -2321,107 +2484,6 @@ static mlir::acc::LoopOp createLoopOp( } } - llvm::SmallVector ivTypes; - llvm::SmallVector ivLocs; - llvm::SmallVector inclusiveBounds; - llvm::SmallVector locs; - locs.push_back(currentLocation); // Location of the directive - 
Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); - bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); - if (isDoConcurrent) { - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - const Fortran::parser::LoopControl *loopControl = - &*outerDoConstruct.GetLoopControl(); - const auto &concurrent = - std::get(loopControl->u); - if (!std::get>(concurrent.t) - .empty()) - TODO(currentLocation, "DO CONCURRENT with locality spec"); - - const auto &concurrentHeader = - std::get(concurrent.t); - const auto &controls = - std::get>( - concurrentHeader.t); - for (const auto &control : controls) { - lowerbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx))); - upperbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx))); - if (const auto &expr = - std::get>( - control.t)) - steps.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*expr), stmtCtx))); - else // If `step` is not present, assume it is `1`. 
- steps.push_back(builder.createIntegerConstant( - currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); - - const auto &name = std::get(control.t); - privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs, - privateOperands, ivPrivate, privatizationRecipes, - isDoConcurrent); - - inclusiveBounds.push_back(true); - } - } else { - int64_t loopCount = - Fortran::lower::getLoopCountForCollapseAndTile(accClauseList); - for (unsigned i = 0; i < loopCount; ++i) { - const Fortran::parser::LoopControl *loopControl; - if (i == 0) { - loopControl = &*outerDoConstruct.GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - } else { - auto *doCons = crtEval->getIf(); - assert(doCons && "expect do construct"); - loopControl = &*doCons->GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(*doCons))); - } - - const Fortran::parser::LoopControl::Bounds *bounds = - std::get_if(&loopControl->u); - assert(bounds && "Expected bounds on the loop construct"); - lowerbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); - upperbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); - if (bounds->step) - steps.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); - else // If `step` is not present, assume it is `1`. 
- steps.push_back(builder.createIntegerConstant( - currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); - - Fortran::semantics::Symbol &ivSym = - bounds->name.thing.symbol->GetUltimate(); - privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, - privateOperands, ivPrivate, privatizationRecipes); - - inclusiveBounds.push_back(true); - - if (i < loopCount - 1) - crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); - } - } - - // Prepare the operand segment size attribute and the operands value range. - llvm::SmallVector operands; - llvm::SmallVector operandSegments; - addOperands(operands, operandSegments, lowerbounds); - addOperands(operands, operandSegments, upperbounds); - addOperands(operands, operandSegments, steps); - addOperands(operands, operandSegments, gangOperands); - addOperands(operands, operandSegments, workerNumOperands); - addOperands(operands, operandSegments, vectorOperands); - addOperands(operands, operandSegments, tileOperands); - addOperands(operands, operandSegments, cacheOperands); - addOperands(operands, operandSegments, privateOperands); - addOperands(operands, operandSegments, reductionOperands); - llvm::SmallVector retTy; mlir::Value yieldValue; if (needEarlyReturnHandling) { @@ -2430,16 +2492,13 @@ static mlir::acc::LoopOp createLoopOp( retTy.push_back(i1Ty); } - auto loopOp = createRegionOp( - builder, builder.getFusedLoc(locs), currentLocation, eval, operands, - operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes, - ivLocs); - - for (auto [arg, value] : llvm::zip( - loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) - builder.create(currentLocation, arg, value); - - loopOp.setInclusiveUpperbound(inclusiveBounds); + uint64_t loopsToProcess = + Fortran::lower::getLoopCountForCollapseAndTile(accClauseList); + auto loopOp = buildACCLoopOp( + converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct, + eval, privateOperands, privatizationRecipes, 
gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands, retTy, yieldValue, loopsToProcess); if (!gangDeviceTypes.empty()) loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes)); @@ -4891,6 +4950,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) { return false; } +bool Fortran::lower::isInsideOpenACCComputeConstruct( + fir::FirOpBuilder &builder) { + return mlir::isa_and_nonnull( + mlir::acc::getEnclosingComputeOp(builder.getRegion())); +} + void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside( fir::FirOpBuilder &builder) { if (auto loopOp = @@ -4905,10 +4970,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder, builder.create(loc, yieldValue); } -int64_t Fortran::lower::getLoopCountForCollapseAndTile( +uint64_t Fortran::lower::getLoopCountForCollapseAndTile( const Fortran::parser::AccClauseList &clauseList) { - int64_t collapseLoopCount = 1; - int64_t tileLoopCount = 1; + uint64_t collapseLoopCount = 1; + uint64_t tileLoopCount = 1; for (const Fortran::parser::AccClause &clause : clauseList.v) { if (const auto *collapseClause = std::get_if(&clause.u)) { @@ -4927,3 +4992,91 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile( return tileLoopCount; return collapseLoopCount; } + +/// Create an ACC loop operation for a DO construct when inside ACC compute +/// constructs. This serves as a bridge between regular DO construct handling +/// and ACC loop creation. Returns nullptr when the loop is not converted. +mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct( + AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::SymMap &localSymbols, + const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) { + // Only convert loops whose induction variables need to be privatized. 
+ if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent()) + return nullptr; + + // If the evaluation is not structured, then we cannot convert the loop + // because acc loop does not have an unstructured form. + // TODO: There may be other strategies that can be employed such + // as generating acc.private for the loop variables without attaching + // them to acc.loop. + if (eval.lowerAsUnstructured()) + return nullptr; + + // Open up a new scope for the loop variables. + // NOTE(review): the scope guard pops this scope when this function returns, + // which happens before Bridge.cpp lowers the loop body - confirm intended. + localSymbols.pushScope(); + auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); }); + + // Prepare empty operand vectors since there are no associated `acc loop` + // clauses with the Fortran do loops being handled here. + llvm::SmallVector privateOperands, gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands; + llvm::SmallVector privatizationRecipes; + llvm::SmallVector retTy; + mlir::Value yieldValue; + uint64_t loopsToProcess = 1; // Single loop construct + + // Use same mechanism that handles `acc loop` contained do loops to handle + // the implicit loop case. + Fortran::lower::StatementContext stmtCtx; + auto loopOp = buildACCLoopOp( + converter, converter.getCurrentLocation(), semanticsContext, stmtCtx, + doConstruct, eval, privateOperands, privatizationRecipes, gangOperands, + workerNumOperands, vectorOperands, tileOperands, cacheOperands, + reductionOperands, retTy, yieldValue, loopsToProcess); + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + if (!privatizationRecipes.empty()) + loopOp.setPrivatizationRecipesAttr(mlir::ArrayAttr::get( + converter.getFirOpBuilder().getContext(), privatizationRecipes)); + + // Normal do loops which are not annotated with `acc loop` should be + // left for analysis by marking them with `auto`. This holds even inside an + // `acc parallel` region because the normal rule of applying `independent` + // only covers loops marked with `acc loop`. 
+ // For do concurrent loops, the spec says in section 2.17.2: + // "When do concurrent appears without a loop construct in a kernels construct + // it is treated as if it is annotated with loop auto. If it appears in a + // parallel construct or an accelerator routine then it is treated as if it is + // annotated with loop independent." + // So this means that in all cases we mark with `auto` unless it is a + // `do concurrent` in an `acc parallel` construct or it must be `seq` because + // it is in an `acc serial` construct. + mlir::Operation *accRegionOp = + mlir::acc::getEnclosingComputeOp(converter.getFirOpBuilder().getRegion()); + mlir::acc::LoopParMode parMode = + mlir::isa_and_present(accRegionOp) && + doConstruct.IsDoConcurrent() + ? mlir::acc::LoopParMode::loop_independent + : mlir::isa_and_present(accRegionOp) + ? mlir::acc::LoopParMode::loop_seq + : mlir::acc::LoopParMode::loop_auto; + + // Set the parallel mode based on the computed parMode. + auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); + auto arrOfDeviceNone = + mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr); + if (parMode == mlir::acc::LoopParMode::loop_independent) { + loopOp.setIndependentAttr(arrOfDeviceNone); + } else if (parMode == mlir::acc::LoopParMode::loop_seq) { + loopOp.setSeqAttr(arrOfDeviceNone); + } else if (parMode == mlir::acc::LoopParMode::loop_auto) { + loopOp.setAuto_Attr(arrOfDeviceNone); + } else { + llvm_unreachable("Unexpected loop par mode"); + } + + return loopOp; +} diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 new file mode 100644 index 0000000000000..d34cd11795b0f --- /dev/null +++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 @@ -0,0 +1,332 @@ +! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs. +! 
Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPbasic_do_loop +subroutine basic_do_loop() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do loop that should be converted to acc.loop + !$acc kernels + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_concurrent +subroutine basic_do_concurrent() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop + !$acc kernels + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel +subroutine basic_do_loop_parallel() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do loop with acc parallel that should be converted to acc.loop + !$acc parallel + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end parallel + +! CHECK: acc.parallel { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_loop_serial +subroutine basic_do_loop_serial() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! 
Basic do loop with acc serial that should be converted to acc.loop + !$acc serial + do i = 1, n + a(i) = b(i) + 1.0 + end do + !$acc end serial + +! CHECK: acc.serial { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel +subroutine basic_do_concurrent_parallel() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop with acc parallel + !$acc parallel + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end parallel + +! CHECK: acc.parallel { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} + +end subroutine + +! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial +subroutine basic_do_concurrent_serial() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Basic do concurrent loop with acc serial + !$acc serial + do concurrent (i = 1:n) + a(i) = b(i) + 1.0 + end do + !$acc end serial + +! CHECK: acc.serial { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} + +end subroutine + +! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent +subroutine multi_dimension_do_concurrent() + integer :: i, j, k + integer, parameter :: n = 10, m = 20, l = 5 + real, dimension(n,m,l) :: a, b + + ! Multi-dimensional do concurrent with multiple iteration variables + !$acc kernels + do concurrent (i = 1:n, j = 1:m, k = 1:l) + a(i,j,k) = b(i,j,k) * 2.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! 
CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} +end subroutine + + +! CHECK-LABEL: func.func @_QPnested_do_loops +subroutine nested_do_loops() + integer :: i, j + integer, parameter :: n = 10, m = 20 + real, dimension(n,m) :: a, b + + ! Nested do loops + !$acc kernels + do i = 1, n + do j = 1, m + a(i,j) = b(i,j) + i + j + end do + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} + +end subroutine + +! CHECK-LABEL: func.func @_QPvariable_bounds_and_step +subroutine variable_bounds_and_step(n, start_val, step_val) + integer, intent(in) :: n, start_val, step_val + integer :: i + real, dimension(n) :: a, b + + ! Do loop with variable bounds and step + !$acc kernels + do i = start_val, n, step_val + a(i) = b(i) * 2.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.yield +! CHECK: attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} + +end subroutine + +! CHECK-LABEL: func.func @_QPdifferent_iv_types +subroutine different_iv_types() + integer(kind=8) :: i8 + integer(kind=4) :: i4 + integer(kind=2) :: i2 + integer, parameter :: n = 10 + real, dimension(n) :: a, b, c, d + + ! 
Test different iteration variable types + !$acc kernels + do i8 = 1_8, int(n,8) + a(i8) = b(i8) + 1.0 + end do + !$acc end kernels + + !$acc kernels + do i4 = 1, n + b(i4) = c(i4) + 1.0 + end do + !$acc end kernels + + !$acc kernels + do i2 = 1_2, int(n,2) + c(i2) = d(i2) + 1.0 + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64) +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) +! CHECK: acc.kernels { +! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16) + +end subroutine + +! ----------------------------------------------------------------------------------------- +! Tests for loops that should NOT be converted to acc.loop: loops without an +! induction variable and loops with unstructured control flow + +! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv +subroutine infinite_loop_no_iv() + integer :: i + logical :: condition + + ! Infinite loop with no induction variable - should NOT convert to acc.loop + !$acc kernels + do + i = i + 1 + if (i > 100) exit + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine + +! CHECK-LABEL: func.func @_QPdo_loop_with_goto +subroutine do_loop_with_goto() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Do loop with goto - unstructured control flow is not converted. + !$acc kernels + do i = 1, n + a(i) = b(i) + 1.0 + if (i == 5) goto 100 + 100 continue + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine + + +! CHECK-LABEL: func.func @_QPdo_loop_with_cycle_goto +subroutine do_loop_with_cycle_goto() + integer :: i + integer, parameter :: n = 10 + real, dimension(n) :: a, b + + ! Do loop with cycle and goto - unstructured control flow is not converted. 
+ !$acc kernels + do i = 1, n + if (i == 3) cycle + a(i) = b(i) + 1.0 + if (i == 7) goto 200 + a(i) = a(i) * 2.0 + end do +200 continue + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine + +! CHECK-LABEL: func.func @_QPnested_goto_loop +subroutine nested_goto_loop() + integer :: i, j + integer, parameter :: n = 10, m = 5 + real, dimension(n,m) :: a, b + + ! Nested loop with goto from inner to outer - should NOT convert to acc.loop + !$acc kernels + do i = 1, n + do j = 1, m + a(i,j) = b(i,j) + 1.0 + if (i * j > 20) goto 300 ! Exit both loops + end do + end do +300 continue + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine + +! CHECK-LABEL: func.func @_QPwhile_like_loop +subroutine while_like_loop() + integer :: i + logical :: condition + + i = 1 + condition = .true. + + ! While-like infinite loop - should NOT convert to acc.loop + !$acc kernels + do while (condition) + i = i + 1 + if (i > 100) condition = .false. + end do + !$acc end kernels + +! CHECK: acc.kernels { +! CHECK-NOT: acc.loop + +end subroutine diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 96b9adcc53b3c..19b81267c32dd 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -134,6 +134,24 @@ def OpenACC_VariableTypeCategory : I32BitEnumAttr< let printBitEnumPrimaryGroups = 1; } +// These are parallelism determination modes for `acc loop`. +// In the enum names, we use the "loop_" prefix because "auto" is +// a language keyword - and thus for consistency all other cases +// do the same. 
+def OpenACC_LoopSeq : I32EnumAttrCase<"loop_seq", 0>; +def OpenACC_LoopAuto : I32EnumAttrCase<"loop_auto", 1>; +def OpenACC_LoopIndependent : I32EnumAttrCase<"loop_independent", 2>; + +def OpenACC_LoopParMode : I32EnumAttr< + "LoopParMode", + "Encodes the options for loop parallelism determination mode", + [ + OpenACC_LoopAuto, OpenACC_LoopIndependent, + OpenACC_LoopSeq]> { + let cppNamespace = "::mlir::acc"; + let genSpecializedAttr = 0; +} + // Type used in operation below. def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>; @@ -2404,6 +2422,53 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", }]; let hasVerifier = 1; + + let builders = [ + OpBuilder<(ins "::mlir::ValueRange":$lowerbounds, + "::mlir::ValueRange":$upperbounds, + "::mlir::ValueRange":$steps, + "LoopParMode":$parMode), [{ + auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get( + $_builder.getContext(), mlir::acc::DeviceType::None); + auto arrOfDeviceNone = mlir::ArrayAttr::get( + $_builder.getContext(), deviceNoneAttr); + build($_builder, $_state, + /*results=*/{}, + /*lowerbound=*/lowerbounds, + /*upperbound=*/upperbounds, + /*step=*/steps, + /*inclusiveUpperbound=*/nullptr, + /*collapse=*/nullptr, + /*collapseDeviceType=*/nullptr, + /*gangOperands=*/{}, + /*gangOperandsArgType=*/nullptr, + /*gangOperandsSegments=*/nullptr, + /*gangOperandsDeviceType=*/nullptr, + /*workerNumOperands=*/{}, + /*workerNumOperandsDeviceType=*/nullptr, + /*vectorOperands=*/{}, + /*vectorOperandsDeviceType=*/nullptr, + /*seq=*/parMode == LoopParMode::loop_seq ? + arrOfDeviceNone : nullptr, + /*independent=*/parMode == LoopParMode::loop_independent ? + arrOfDeviceNone : nullptr, + /*auto_=*/parMode == LoopParMode::loop_auto ? 
+ arrOfDeviceNone : nullptr, + /*gang=*/nullptr, + /*worker=*/nullptr, + /*vector=*/nullptr, + /*tileOperands=*/{}, + /*tileOperandsSegments=*/nullptr, + /*tileOperandsDeviceType=*/nullptr, + /*cacheOperands=*/{}, + /*privateOperands=*/{}, + /*privatizationRecipes=*/nullptr, + /*reductionOperands=*/{}, + /*reductionRecipes=*/nullptr, + /*combined=*/nullptr); + }] + > + ]; } // Yield operation for the acc.loop and acc.parallel operations.