From 78affa2a398ce503de265dba50c6988406d21697 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru@nvidia.com>
Date: Fri, 18 Jul 2025 16:05:43 -0700
Subject: [PATCH 1/5] [flang][acc] Lower do and do concurrent loops specially
 in acc regions

When OpenACC is enabled and Fortran loops are annotated with `acc loop`,
they are lowered to an `acc.loop` operation, while the rest of the contained
loops use the normal FIR lowering path.

However, the OpenACC specification has special provisions related
to contained loops and their induction variables. In order to adhere to
these, we convert all valid contained loops to `acc.loop` in order to
store this information appropriately.

The provisions in the spec that motivated this change (line numbers
are from OpenACC 3.4):
- 1353 Loop variables in Fortran do statements within a compute
construct are predetermined to be private to the thread that executes
the loop.
- 3783 When do concurrent appears without a loop construct in a kernels
construct it is treated as if it is annotated with loop auto. If it
appears in a parallel construct or an accelerator routine then it is
treated as if it is annotated with loop independent.

By valid loops, we mean do loops and do concurrent loops which have
induction variables. Loops which are unstructured are not handled.
---
 flang/include/flang/Lower/OpenACC.h           |  22 +-
 flang/lib/Lower/Bridge.cpp                    |  33 +-
 flang/lib/Lower/OpenACC.cpp                   | 385 ++++++++++++------
 .../Lower/OpenACC/do-loops-to-acc-loops.f90   | 332 +++++++++++++++
 .../mlir/Dialect/OpenACC/OpenACCOps.td        |  65 +++
 5 files changed, 716 insertions(+), 121 deletions(-)
 create mode 100644 flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90

diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index af3451023e3df..8b13ce94f7bc4 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -43,6 +43,7 @@ struct ProcedureDesignator;
 
 namespace parser {
 struct AccClauseList;
+struct DoConstruct;
 struct OpenACCConstruct;
 struct OpenACCDeclarativeConstruct;
 struct OpenACCRoutineConstruct;
@@ -58,6 +59,7 @@ namespace lower {
 
 class AbstractConverter;
 class StatementContext;
+class SymMap;
 
 namespace pft {
 struct Evaluation;
@@ -114,14 +116,32 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &,
 void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
                           mlir::Location);
 
-int64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
+/// Used to obtain the number of contained loops to look for,
+/// since this depends on the number of tile operands and on the
+/// collapse clause.
+uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
 
+/// Checks whether the current insertion point is inside an OpenACC loop.
 bool isInOpenACCLoop(fir::FirOpBuilder &);
 
+/// Checks whether the insertion point is inside an OpenACC compute construct.
+bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
+
 void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);
 
 void genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &, mlir::Location);
 
+/// Generates an OpenACC loop from a do construct in order to
+/// properly capture the loop bounds, parallelism determination mode,
+/// and to privatize the loop variables.
+/// When the conversion is rejected, nullptr is returned.
+mlir::Operation *genOpenACCLoopFromDoConstruct(
+    AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semanticsContext,
+    Fortran::lower::SymMap &localSymbols,
+    const Fortran::parser::DoConstruct &doConstruct,
+    pft::Evaluation &eval);
+
 } // namespace lower
 } // namespace Fortran
 
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5f0783f869bf6..4073b8623e333 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2164,10 +2164,35 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   ///  - structured and unstructured concurrent loops
   void genFIR(const Fortran::parser::DoConstruct &doConstruct) {
     setCurrentPositionAt(doConstruct);
-    // Collect loop nest information.
-    // Generate begin loop code directly for infinite and while loops.
     Fortran::lower::pft::Evaluation &eval = getEval();
     bool unstructuredContext = eval.lowerAsUnstructured();
+
+    // Loops with induction variables inside OpenACC compute constructs
+    // need special handling to ensure that the IVs are privatized.
+    if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
+      mlir::Operation* loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
+                         *this, bridge.getSemanticsContext(), localSymbols,
+                         doConstruct, eval);
+      bool success = loopOp != nullptr;
+      if (success) {
+        // Sanity check that the builder insertion point is inside the newly
+        // generated loop.
+        assert(
+            loopOp->getRegion(0).isAncestor(
+                builder->getInsertionPoint()->getBlock()->getParent()) &&
+            "builder insertion point is not inside the newly generated loop");
+
+        // Loop body code.
+        auto iter = eval.getNestedEvaluations().begin();
+        for (auto end = --eval.getNestedEvaluations().end(); iter != end; ++iter)
+          genFIR(*iter, unstructuredContext);
+        return;
+      }
+      // Fall back to normal loop handling.
+    }
+
+    // Collect loop nest information.
+    // Generate begin loop code directly for infinite and while loops.
     Fortran::lower::pft::Evaluation &doStmtEval =
         eval.getFirstNestedEvaluation();
     auto *doStmt = doStmtEval.getIf<Fortran::parser::NonLabelDoStmt>();
@@ -3121,7 +3146,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     Fortran::lower::pft::Evaluation *curEval = &getEval();
 
     if (accLoop || accCombined) {
-      int64_t loopCount;
+      uint64_t loopCount;
       if (accLoop) {
         const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
             std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
@@ -3139,7 +3164,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
       if (curEval->lowerAsStructured()) {
         curEval = &curEval->getFirstNestedEvaluation();
-        for (int64_t i = 1; i < loopCount; i++)
+        for (uint64_t i = 1; i < loopCount; i++)
           curEval = &*std::next(curEval->getNestedEvaluations().begin());
       }
     }
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 51eb33dec186b..950b02501751a 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -35,6 +35,7 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Frontend/OpenACC/ACC.h.inc"
 #include "llvm/Support/CommandLine.h"
@@ -2138,6 +2139,168 @@ static void determineDefaultLoopParMode(
   }
 }
 
+// Extract loop bounds, steps, induction variables, and privatization info
+// for both DO CONCURRENT and regular do loops.
+static void processDoLoopBounds(
+    Fortran::lower::AbstractConverter &converter,
+    mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
+    fir::FirOpBuilder &builder,
+    const Fortran::parser::DoConstruct &outerDoConstruct,
+    Fortran::lower::pft::Evaluation &eval,
+    llvm::SmallVector<mlir::Value> &lowerbounds,
+    llvm::SmallVector<mlir::Value> &upperbounds,
+    llvm::SmallVector<mlir::Value> &steps,
+    llvm::SmallVector<mlir::Value> &privateOperands,
+    llvm::SmallVector<mlir::Value> &ivPrivate,
+    llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+    llvm::SmallVector<mlir::Type> &ivTypes,
+    llvm::SmallVector<mlir::Location> &ivLocs,
+    llvm::SmallVector<bool> &inclusiveBounds,
+    llvm::SmallVector<mlir::Location> &locs, uint64_t loopsToProcess) {
+  assert(loopsToProcess > 0 && "expect at least one loop");
+  locs.push_back(currentLocation); // Location of the directive
+  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
+  bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
+
+  if (isDoConcurrent) {
+    locs.push_back(converter.genLocation(
+        Fortran::parser::FindSourceLocation(outerDoConstruct)));
+    const Fortran::parser::LoopControl *loopControl =
+        &*outerDoConstruct.GetLoopControl();
+    const auto &concurrent =
+        std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
+    if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
+             .empty())
+      TODO(currentLocation, "DO CONCURRENT with locality spec inside ACC");
+
+    const auto &concurrentHeader =
+        std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
+    const auto &controls =
+        std::get<std::list<Fortran::parser::ConcurrentControl>>(
+            concurrentHeader.t);
+    for (const auto &control : controls) {
+      lowerbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
+      upperbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
+      if (const auto &expr =
+              std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
+                  control.t))
+        steps.push_back(fir::getBase(converter.genExprValue(
+            *Fortran::semantics::GetExpr(*expr), stmtCtx)));
+      else // If `step` is not present, assume it is `1`.
+        steps.push_back(builder.createIntegerConstant(
+            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+      const auto &name = std::get<Fortran::parser::Name>(control.t);
+      privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
+                  privateOperands, ivPrivate, privatizationRecipes,
+                  isDoConcurrent);
+
+      inclusiveBounds.push_back(true);
+    }
+  } else {
+    for (uint64_t i = 0; i < loopsToProcess; ++i) {
+      const Fortran::parser::LoopControl *loopControl;
+      if (i == 0) {
+        loopControl = &*outerDoConstruct.GetLoopControl();
+        locs.push_back(converter.genLocation(
+            Fortran::parser::FindSourceLocation(outerDoConstruct)));
+      } else {
+        auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
+        assert(doCons && "expect do construct");
+        loopControl = &*doCons->GetLoopControl();
+        locs.push_back(converter.genLocation(
+            Fortran::parser::FindSourceLocation(*doCons)));
+      }
+
+      const Fortran::parser::LoopControl::Bounds *bounds =
+          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
+      assert(bounds && "Expected bounds on the loop construct");
+      lowerbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
+      upperbounds.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
+      if (bounds->step)
+        steps.push_back(fir::getBase(converter.genExprValue(
+            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
+      else // If `step` is not present, assume it is `1`.
+        steps.push_back(builder.createIntegerConstant(
+            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
+
+      Fortran::semantics::Symbol &ivSym =
+          bounds->name.thing.symbol->GetUltimate();
+      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
+                  privateOperands, ivPrivate, privatizationRecipes);
+
+      inclusiveBounds.push_back(true);
+
+      if (i < loopsToProcess - 1)
+        crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
+    }
+  }
+}
+
+static mlir::acc::LoopOp
+buildACCLoopOp(Fortran::lower::AbstractConverter &converter,
+               mlir::Location currentLocation,
+               Fortran::semantics::SemanticsContext &semanticsContext,
+               Fortran::lower::StatementContext &stmtCtx,
+               const Fortran::parser::DoConstruct &outerDoConstruct,
+               Fortran::lower::pft::Evaluation &eval,
+               llvm::SmallVector<mlir::Value> &privateOperands,
+               llvm::SmallVector<mlir::Attribute> &privatizationRecipes,
+               llvm::SmallVector<mlir::Value> &gangOperands,
+               llvm::SmallVector<mlir::Value> &workerNumOperands,
+               llvm::SmallVector<mlir::Value> &vectorOperands,
+               llvm::SmallVector<mlir::Value> &tileOperands,
+               llvm::SmallVector<mlir::Value> &cacheOperands,
+               llvm::SmallVector<mlir::Value> &reductionOperands,
+               llvm::SmallVector<mlir::Type> &retTy, mlir::Value yieldValue,
+               uint64_t loopsToProcess) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
+  llvm::SmallVector<mlir::Value> ivPrivate;
+  llvm::SmallVector<mlir::Type> ivTypes;
+  llvm::SmallVector<mlir::Location> ivLocs;
+  llvm::SmallVector<bool> inclusiveBounds;
+  llvm::SmallVector<mlir::Location> locs;
+  llvm::SmallVector<mlir::Value> lowerbounds, upperbounds, steps;
+
+  // Look at the do/do concurrent loops to extract bounds information.
+  processDoLoopBounds(converter, currentLocation, stmtCtx, builder,
+                      outerDoConstruct, eval, lowerbounds, upperbounds, steps,
+                      privateOperands, ivPrivate, privatizationRecipes, ivTypes,
+                      ivLocs, inclusiveBounds, locs, loopsToProcess);
+
+  // Prepare the operand segment size attribute and the operands value range.
+  llvm::SmallVector<mlir::Value> operands;
+  llvm::SmallVector<int32_t> operandSegments;
+  addOperands(operands, operandSegments, lowerbounds);
+  addOperands(operands, operandSegments, upperbounds);
+  addOperands(operands, operandSegments, steps);
+  addOperands(operands, operandSegments, gangOperands);
+  addOperands(operands, operandSegments, workerNumOperands);
+  addOperands(operands, operandSegments, vectorOperands);
+  addOperands(operands, operandSegments, tileOperands);
+  addOperands(operands, operandSegments, cacheOperands);
+  addOperands(operands, operandSegments, privateOperands);
+  addOperands(operands, operandSegments, reductionOperands);
+
+  auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
+      builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
+      operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
+      ivLocs);
+
+  for (auto [arg, value] : llvm::zip(
+           loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
+    builder.create<fir::StoreOp>(currentLocation, arg, value);
+
+  loopOp.setInclusiveUpperbound(inclusiveBounds);
+
+  return loopOp;
+}
+
 static mlir::acc::LoopOp createLoopOp(
     Fortran::lower::AbstractConverter &converter,
     mlir::Location currentLocation,
@@ -2150,9 +2313,9 @@ static mlir::acc::LoopOp createLoopOp(
         std::nullopt,
     bool needEarlyReturnHandling = false) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-  llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
+  llvm::SmallVector<mlir::Value> tileOperands, privateOperands,
       reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
-      gangOperands, lowerbounds, upperbounds, steps;
+      gangOperands;
   llvm::SmallVector<mlir::Attribute> privatizationRecipes, reductionRecipes;
   llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
   llvm::SmallVector<int64_t> collapseValues;
@@ -2321,107 +2484,6 @@ static mlir::acc::LoopOp createLoopOp(
     }
   }
 
-  llvm::SmallVector<mlir::Type> ivTypes;
-  llvm::SmallVector<mlir::Location> ivLocs;
-  llvm::SmallVector<bool> inclusiveBounds;
-  llvm::SmallVector<mlir::Location> locs;
-  locs.push_back(currentLocation); // Location of the directive
-  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
-  bool isDoConcurrent = outerDoConstruct.IsDoConcurrent();
-  if (isDoConcurrent) {
-    locs.push_back(converter.genLocation(
-        Fortran::parser::FindSourceLocation(outerDoConstruct)));
-    const Fortran::parser::LoopControl *loopControl =
-        &*outerDoConstruct.GetLoopControl();
-    const auto &concurrent =
-        std::get<Fortran::parser::LoopControl::Concurrent>(loopControl->u);
-    if (!std::get<std::list<Fortran::parser::LocalitySpec>>(concurrent.t)
-             .empty())
-      TODO(currentLocation, "DO CONCURRENT with locality spec");
-
-    const auto &concurrentHeader =
-        std::get<Fortran::parser::ConcurrentHeader>(concurrent.t);
-    const auto &controls =
-        std::get<std::list<Fortran::parser::ConcurrentControl>>(
-            concurrentHeader.t);
-    for (const auto &control : controls) {
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx)));
-      if (const auto &expr =
-              std::get<std::optional<Fortran::parser::ScalarIntExpr>>(
-                  control.t))
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(*expr), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      const auto &name = std::get<Fortran::parser::Name>(control.t);
-      privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes,
-                  isDoConcurrent);
-
-      inclusiveBounds.push_back(true);
-    }
-  } else {
-    int64_t loopCount =
-        Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
-    for (unsigned i = 0; i < loopCount; ++i) {
-      const Fortran::parser::LoopControl *loopControl;
-      if (i == 0) {
-        loopControl = &*outerDoConstruct.GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(outerDoConstruct)));
-      } else {
-        auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
-        assert(doCons && "expect do construct");
-        loopControl = &*doCons->GetLoopControl();
-        locs.push_back(converter.genLocation(
-            Fortran::parser::FindSourceLocation(*doCons)));
-      }
-
-      const Fortran::parser::LoopControl::Bounds *bounds =
-          std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
-      assert(bounds && "Expected bounds on the loop construct");
-      lowerbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
-      upperbounds.push_back(fir::getBase(converter.genExprValue(
-          *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
-      if (bounds->step)
-        steps.push_back(fir::getBase(converter.genExprValue(
-            *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
-      else // If `step` is not present, assume it is `1`.
-        steps.push_back(builder.createIntegerConstant(
-            currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1));
-
-      Fortran::semantics::Symbol &ivSym =
-          bounds->name.thing.symbol->GetUltimate();
-      privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs,
-                  privateOperands, ivPrivate, privatizationRecipes);
-
-      inclusiveBounds.push_back(true);
-
-      if (i < loopCount - 1)
-        crtEval = &*std::next(crtEval->getNestedEvaluations().begin());
-    }
-  }
-
-  // Prepare the operand segment size attribute and the operands value range.
-  llvm::SmallVector<mlir::Value> operands;
-  llvm::SmallVector<int32_t> operandSegments;
-  addOperands(operands, operandSegments, lowerbounds);
-  addOperands(operands, operandSegments, upperbounds);
-  addOperands(operands, operandSegments, steps);
-  addOperands(operands, operandSegments, gangOperands);
-  addOperands(operands, operandSegments, workerNumOperands);
-  addOperands(operands, operandSegments, vectorOperands);
-  addOperands(operands, operandSegments, tileOperands);
-  addOperands(operands, operandSegments, cacheOperands);
-  addOperands(operands, operandSegments, privateOperands);
-  addOperands(operands, operandSegments, reductionOperands);
-
   llvm::SmallVector<mlir::Type> retTy;
   mlir::Value yieldValue;
   if (needEarlyReturnHandling) {
@@ -2430,16 +2492,13 @@ static mlir::acc::LoopOp createLoopOp(
     retTy.push_back(i1Ty);
   }
 
-  auto loopOp = createRegionOp<mlir::acc::LoopOp, mlir::acc::YieldOp>(
-      builder, builder.getFusedLoc(locs), currentLocation, eval, operands,
-      operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes,
-      ivLocs);
-
-  for (auto [arg, value] : llvm::zip(
-           loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate))
-    builder.create<fir::StoreOp>(currentLocation, arg, value);
-
-  loopOp.setInclusiveUpperbound(inclusiveBounds);
+  uint64_t loopsToProcess =
+      Fortran::lower::getLoopCountForCollapseAndTile(accClauseList);
+  auto loopOp = buildACCLoopOp(
+      converter, currentLocation, semanticsContext, stmtCtx, outerDoConstruct,
+      eval, privateOperands, privatizationRecipes, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands, retTy, yieldValue, loopsToProcess);
 
   if (!gangDeviceTypes.empty())
     loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes));
@@ -4891,6 +4950,12 @@ bool Fortran::lower::isInOpenACCLoop(fir::FirOpBuilder &builder) {
   return false;
 }
 
+bool Fortran::lower::isInsideOpenACCComputeConstruct(
+    fir::FirOpBuilder &builder) {
+  return mlir::isa_and_nonnull<ACC_COMPUTE_CONSTRUCT_OPS>(
+      mlir::acc::getEnclosingComputeOp(builder.getRegion()));
+}
+
 void Fortran::lower::setInsertionPointAfterOpenACCLoopIfInside(
     fir::FirOpBuilder &builder) {
   if (auto loopOp =
@@ -4905,10 +4970,10 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder,
   builder.create<mlir::acc::YieldOp>(loc, yieldValue);
 }
 
-int64_t Fortran::lower::getLoopCountForCollapseAndTile(
+uint64_t Fortran::lower::getLoopCountForCollapseAndTile(
     const Fortran::parser::AccClauseList &clauseList) {
-  int64_t collapseLoopCount = 1;
-  int64_t tileLoopCount = 1;
+  uint64_t collapseLoopCount = 1;
+  uint64_t tileLoopCount = 1;
   for (const Fortran::parser::AccClause &clause : clauseList.v) {
     if (const auto *collapseClause =
             std::get_if<Fortran::parser::AccClause::Collapse>(&clause.u)) {
@@ -4927,3 +4992,91 @@ int64_t Fortran::lower::getLoopCountForCollapseAndTile(
     return tileLoopCount;
   return collapseLoopCount;
 }
+
+/// Create an ACC loop operation for a DO construct when inside ACC compute
+/// constructs. This serves as a bridge between regular DO construct handling
+/// and ACC loop creation.
+mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
+    AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semanticsContext,
+    Fortran::lower::SymMap &localSymbols,
+    const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval) {
+  // Only convert loops whose induction variables need to be privatized.
+  if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent())
+    return nullptr;
+
+  // If the evaluation is not structured, then we cannot convert the loop
+  // because acc loop does not have an unstructured form.
+  // TODO: There may be other strategies that can be employed such
+  // as generating acc.private for the loop variables without attaching
+  // them to acc.loop.
+  if (eval.lowerAsUnstructured())
+    return nullptr;
+
+  // Open up a new scope for the loop variables.
+  localSymbols.pushScope();
+  auto scopeGuard = llvm::make_scope_exit([&]() { localSymbols.popScope(); });
+
+  // Prepare empty operand vectors since there are no associated `acc loop`
+  // clauses with the Fortran do loops being handled here.
+  llvm::SmallVector<mlir::Value> privateOperands, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands;
+  llvm::SmallVector<mlir::Attribute> privatizationRecipes;
+  llvm::SmallVector<mlir::Type> retTy;
+  mlir::Value yieldValue;
+  uint64_t loopsToProcess = 1; // Single loop construct
+
+  // Use same mechanism that handles `acc loop` contained do loops to handle
+  // the implicit loop case.
+  Fortran::lower::StatementContext stmtCtx;
+  auto loopOp = buildACCLoopOp(
+      converter, converter.getCurrentLocation(), semanticsContext, stmtCtx,
+      doConstruct, eval, privateOperands, privatizationRecipes, gangOperands,
+      workerNumOperands, vectorOperands, tileOperands, cacheOperands,
+      reductionOperands, retTy, yieldValue, loopsToProcess);
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  if (!privatizationRecipes.empty())
+    loopOp.setPrivatizationRecipesAttr(mlir::ArrayAttr::get(
+        converter.getFirOpBuilder().getContext(), privatizationRecipes));
+
+  // Normal do loops which are not annotated with `acc loop` should be
+  // left for analysis by marking them with `auto`. This holds even inside an
+  // `acc parallel` region because the normal rule of applying `independent`
+  // covers only loops marked with `acc loop`.
+  // For do concurrent loops, the spec says in section 2.17.2:
+  // "When do concurrent appears without a loop construct in a kernels construct
+  // it is treated as if it is annotated with loop auto. If it appears in a
+  // parallel construct or an accelerator routine then it is treated as if it is
+  // annotated with loop independent."
+  // So this means that in all cases we mark with `auto` unless it is a
+  // `do concurrent` in an `acc parallel` construct or it must be `seq` because
+  // it is in an `acc serial` construct.
+  mlir::Operation *accRegionOp =
+      mlir::acc::getEnclosingComputeOp(converter.getFirOpBuilder().getRegion());
+  mlir::acc::LoopParMode parMode =
+      mlir::isa_and_present<mlir::acc::ParallelOp>(accRegionOp) &&
+              doConstruct.IsDoConcurrent()
+          ? mlir::acc::LoopParMode::loop_independent
+      : mlir::isa_and_present<mlir::acc::SerialOp>(accRegionOp)
+          ? mlir::acc::LoopParMode::loop_seq
+          : mlir::acc::LoopParMode::loop_auto;
+
+  // Set the parallel mode based on the computed parMode
+  auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
+      builder.getContext(), mlir::acc::DeviceType::None);
+  auto arrOfDeviceNone =
+      mlir::ArrayAttr::get(builder.getContext(), deviceNoneAttr);
+  if (parMode == mlir::acc::LoopParMode::loop_independent) {
+    loopOp.setIndependentAttr(arrOfDeviceNone);
+  } else if (parMode == mlir::acc::LoopParMode::loop_seq) {
+    loopOp.setSeqAttr(arrOfDeviceNone);
+  } else if (parMode == mlir::acc::LoopParMode::loop_auto) {
+    loopOp.setAuto_Attr(arrOfDeviceNone);
+  } else {
+    llvm_unreachable("Unexpected loop par mode");
+  }
+
+  return loopOp;
+}
diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
new file mode 100644
index 0000000000000..d34cd11795b0f
--- /dev/null
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@@ -0,0 +1,332 @@
+! This test checks lowering of Fortran do loops and do concurrent loops to OpenACC loop constructs.
+! Tests the new functionality that converts Fortran iteration constructs to acc.loop with proper IV handling.
+
+! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop
+subroutine basic_do_loop()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop that should be converted to acc.loop
+  !$acc kernels
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent
+subroutine basic_do_concurrent()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop
+  !$acc kernels
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_parallel
+subroutine basic_do_loop_parallel()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop with acc parallel that should be converted to acc.loop
+  !$acc parallel
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_loop_serial
+subroutine basic_do_loop_serial()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do loop with acc serial that should be converted to acc.loop
+  !$acc serial
+  do i = 1, n
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_parallel
+subroutine basic_do_concurrent_parallel()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop with acc parallel
+  !$acc parallel
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end parallel
+
+! CHECK: acc.parallel {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPbasic_do_concurrent_serial
+subroutine basic_do_concurrent_serial()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Basic do concurrent loop with acc serial
+  !$acc serial
+  do concurrent (i = 1:n)
+    a(i) = b(i) + 1.0
+  end do
+  !$acc end serial
+
+! CHECK: acc.serial {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<none>]}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmulti_dimension_do_concurrent
+subroutine multi_dimension_do_concurrent()
+  integer :: i, j, k
+  integer, parameter :: n = 10, m = 20, l = 5
+  real, dimension(n,m,l) :: a, b
+
+  ! Multi-dimensional do concurrent with multiple iteration variables
+  !$acc kernels
+  do concurrent (i = 1:n, j = 1:m, k = 1:l)
+    a(i,j,k) = b(i,j,k) * 2.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+end subroutine
+
+
+! CHECK-LABEL: func.func @_QPnested_do_loops
+subroutine nested_do_loops()
+  integer :: i, j
+  integer, parameter :: n = 10, m = 20
+  real, dimension(n,m) :: a, b
+
+  ! Nested do loops
+  !$acc kernels
+  do i = 1, n
+    do j = 1, m
+      a(i,j) = b(i,j) + i + j
+    end do
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPvariable_bounds_and_step
+subroutine variable_bounds_and_step(n, start_val, step_val)
+  integer, intent(in) :: n, start_val, step_val
+  integer :: i
+  real, dimension(n) :: a, b
+
+  ! Do loop with variable bounds and step
+  !$acc kernels
+  do i = start_val, n, step_val
+    a(i) = b(i) * 2.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.yield
+! CHECK: attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdifferent_iv_types
+subroutine different_iv_types()
+  integer(kind=8) :: i8
+  integer(kind=4) :: i4
+  integer(kind=2) :: i2
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b, c, d
+
+  ! Test different iteration variable types
+  !$acc kernels
+  do i8 = 1_8, int(n,8)
+    a(i8) = b(i8) + 1.0
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  do i4 = 1, n
+    b(i4) = c(i4) + 1.0
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  do i2 = 1_2, int(n,2)
+    c(i2) = d(i2) + 1.0
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i64) = (%{{.*}} : i64) to (%{{.*}} : i64) step (%{{.*}} : i64)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32)
+! CHECK: acc.kernels {
+! CHECK: acc.loop {{.*}} control(%{{.*}} : i16) = (%{{.*}} : i16) to (%{{.*}} : i16) step (%{{.*}} : i16)
+
+end subroutine
+
+! -----------------------------------------------------------------------------------------
+! Tests for loops that should NOT be converted to acc.loop due to unstructured control flow
+
+! CHECK-LABEL: func.func @_QPinfinite_loop_no_iv
+subroutine infinite_loop_no_iv()
+  integer :: i
+  logical :: condition
+
+  ! Infinite loop with no induction variable - should NOT convert to acc.loop
+  !$acc kernels
+  do
+    i = i + 1
+    if (i > 100) exit
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPdo_loop_with_goto
+subroutine do_loop_with_goto()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Do loop with goto - unstructured control flow is not converted.
+  !$acc kernels
+  do i = 1, n
+    a(i) = b(i) + 1.0
+    if (i == 5) goto 100
+    100 continue
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+
+! CHECK-LABEL: func.func @_QPdo_loop_with_cycle_goto
+subroutine do_loop_with_cycle_goto()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Do loop with cycle and goto - unstructured control flow is not converted.
+  !$acc kernels
+  do i = 1, n
+    if (i == 3) cycle
+    a(i) = b(i) + 1.0
+    if (i == 7) goto 200
+    a(i) = a(i) * 2.0
+  end do
+200 continue
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPnested_goto_loop
+subroutine nested_goto_loop()
+  integer :: i, j
+  integer, parameter :: n = 10, m = 5
+  real, dimension(n,m) :: a, b
+
+  ! Nested loop with goto from inner to outer - should NOT convert to acc.loop
+  !$acc kernels
+  do i = 1, n
+    do j = 1, m
+      a(i,j) = b(i,j) + 1.0
+      if (i * j > 20) goto 300  ! Exit both loops
+    end do
+  end do
+300 continue
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPwhile_like_loop
+subroutine while_like_loop()
+  integer :: i
+  logical :: condition
+
+  i = 1
+  condition = .true.
+
+  ! Do while loop (no induction variable) - should NOT convert to acc.loop
+  !$acc kernels
+  do while (condition)
+    i = i + 1
+    if (i > 100) condition = .false.
+  end do
+  !$acc end kernels
+
+! CHECK: acc.kernels {
+! CHECK-NOT: acc.loop
+
+end subroutine
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 96b9adcc53b3c..19b81267c32dd 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -134,6 +134,24 @@ def OpenACC_VariableTypeCategory : I32BitEnumAttr<
   let printBitEnumPrimaryGroups = 1;
 }
 
+// These are parallelism determination modes for `acc loop`.
+// In the enum names, we use the "loop_" prefix because "auto" is
+// a language keyword - and thus for consistency all other cases
+// do the same.
+def OpenACC_LoopSeq : I32EnumAttrCase<"loop_seq", 0>;
+def OpenACC_LoopAuto : I32EnumAttrCase<"loop_auto", 1>;
+def OpenACC_LoopIndependent : I32EnumAttrCase<"loop_independent", 2>;
+
+def OpenACC_LoopParMode : I32EnumAttr<
+    "LoopParMode",
+    "Encodes the options for loop parallelism determination mode",
+    [
+      OpenACC_LoopAuto, OpenACC_LoopIndependent,
+      OpenACC_LoopSeq]> {
+  let cppNamespace = "::mlir::acc";
+  let genSpecializedAttr = 0;
+}
+
 // Type used in operation below.
 def IntOrIndex : AnyTypeOf<[AnyInteger, Index]>;
 
@@ -2404,6 +2422,53 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
   }];
 
   let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "::mlir::ValueRange":$lowerbounds,
+                   "::mlir::ValueRange":$upperbounds,
+                   "::mlir::ValueRange":$steps,
+                   "LoopParMode":$parMode), [{
+        auto deviceNoneAttr = mlir::acc::DeviceTypeAttr::get(
+          $_builder.getContext(), mlir::acc::DeviceType::None);
+        auto arrOfDeviceNone = mlir::ArrayAttr::get(
+          $_builder.getContext(), deviceNoneAttr);
+        build($_builder, $_state,
+          /*results=*/{},
+          /*lowerbound=*/lowerbounds,
+          /*upperbound=*/upperbounds,
+          /*step=*/steps,
+          /*inclusiveUpperbound=*/nullptr,
+          /*collapse=*/nullptr,
+          /*collapseDeviceType=*/nullptr,
+          /*gangOperands=*/{},
+          /*gangOperandsArgType=*/nullptr,
+          /*gangOperandsSegments=*/nullptr,
+          /*gangOperandsDeviceType=*/nullptr,
+          /*workerNumOperands=*/{},
+          /*workerNumOperandsDeviceType=*/nullptr,
+          /*vectorOperands=*/{},
+          /*vectorOperandsDeviceType=*/nullptr,
+          /*seq=*/parMode == LoopParMode::loop_seq ?
+            arrOfDeviceNone : nullptr,
+          /*independent=*/parMode == LoopParMode::loop_independent ?
+            arrOfDeviceNone : nullptr,
+          /*auto_=*/parMode == LoopParMode::loop_auto ?
+            arrOfDeviceNone : nullptr,
+          /*gang=*/nullptr,
+          /*worker=*/nullptr,
+          /*vector=*/nullptr,
+          /*tileOperands=*/{},
+          /*tileOperandsSegments=*/nullptr,
+          /*tileOperandsDeviceType=*/nullptr,
+          /*cacheOperands=*/{},
+          /*privateOperands=*/{},
+          /*privatizationRecipes=*/nullptr,
+          /*reductionOperands=*/{},
+          /*reductionRecipes=*/nullptr,
+          /*combined=*/nullptr);
+      }]
+    >
+  ];
 }
 
 // Yield operation for the acc.loop and acc.parallel operations.

From c1114f839e319cf685d9fbb095cc40d76049ea43 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru@nvidia.com>
Date: Mon, 21 Jul 2025 08:19:28 -0700
Subject: [PATCH 2/5] Fix formatting

---
 flang/include/flang/Lower/OpenACC.h | 6 +++---
 flang/lib/Lower/Bridge.cpp          | 8 ++++----
 flang/lib/Lower/OpenACC.cpp         | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index 8b13ce94f7bc4..e974f3d6eef11 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -124,7 +124,8 @@ uint64_t getLoopCountForCollapseAndTile(const Fortran::parser::AccClauseList &);
 /// Checks whether the current insertion point is inside OpenACC loop.
 bool isInOpenACCLoop(fir::FirOpBuilder &);
 
-/// Checks whether the current insertion point is inside OpenACC compute construct.
+/// Checks whether the current insertion point is inside OpenACC compute
+/// construct.
 bool isInsideOpenACCComputeConstruct(fir::FirOpBuilder &);
 
 void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);
@@ -139,8 +140,7 @@ mlir::Operation *genOpenACCLoopFromDoConstruct(
     AbstractConverter &converter,
     Fortran::semantics::SemanticsContext &semanticsContext,
     Fortran::lower::SymMap &localSymbols,
-    const Fortran::parser::DoConstruct &doConstruct,
-    pft::Evaluation &eval);
+    const Fortran::parser::DoConstruct &doConstruct, pft::Evaluation &eval);
 
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 4073b8623e333..5eb1bafbb7ea2 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2170,9 +2170,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     // Loops with induction variables inside OpenACC compute constructs
     // need special handling to ensure that the IVs are privatized.
     if (Fortran::lower::isInsideOpenACCComputeConstruct(*builder)) {
-      mlir::Operation* loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
-                         *this, bridge.getSemanticsContext(), localSymbols,
-                         doConstruct, eval);
+      mlir::Operation *loopOp = Fortran::lower::genOpenACCLoopFromDoConstruct(
+          *this, bridge.getSemanticsContext(), localSymbols, doConstruct, eval);
       bool success = loopOp != nullptr;
       if (success) {
         // Sanity check that the builder insertion point is inside the newly
@@ -2184,7 +2183,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
         // Loop body code.
         auto iter = eval.getNestedEvaluations().begin();
-        for (auto end = --eval.getNestedEvaluations().end(); iter != end; ++iter)
+        for (auto end = --eval.getNestedEvaluations().end(); iter != end;
+             ++iter)
           genFIR(*iter, unstructuredContext);
         return;
       }
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 950b02501751a..5f58ce02c11b1 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -35,8 +35,8 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
-#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Frontend/OpenACC/ACC.h.inc"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"

From c33681da6b8802ddbdf2974a6c252aebde9c11d1 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru@nvidia.com>
Date: Wed, 23 Jul 2025 15:50:24 -0700
Subject: [PATCH 3/5] Add TODO for unstructured case

---
 flang/lib/Lower/OpenACC.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index b1c68359f5baa..35db6d012573f 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -5013,13 +5013,14 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
   if (!doConstruct.IsDoNormal() && !doConstruct.IsDoConcurrent())
     return nullptr;
 
-  // If the evaluation is not structured, then we cannot convert the loop
+  // If the evaluation is unstructured, then we cannot convert the loop
   // because acc loop does not have an unstructured form.
   // TODO: There may be other strategies that can be employed such
   // as generating acc.private for the loop variables without attaching
   // them to acc.loop.
   if (eval.lowerAsUnstructured())
-    return nullptr;
+    TODO(converter.getCurrentLocation(),
+         "unstructured do loops in acc regions");
 
   // Open up a new scope for the loop variables.
   localSymbols.pushScope();

From 84c1825be53585278c971dee7c8806a8b287bc81 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru@nvidia.com>
Date: Wed, 23 Jul 2025 16:59:19 -0700
Subject: [PATCH 4/5] Add TODO for unstructured in kernels

---
 flang/lib/Lower/OpenACC.cpp                   | 15 ++-
 .../Todo/do-loops-to-acc-loops-todo.f90       | 91 +++++++++++++++++++
 .../Lower/OpenACC/do-loops-to-acc-loops.f90   | 65 -------------
 3 files changed, 103 insertions(+), 68 deletions(-)
 create mode 100644 flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 35db6d012573f..57ce1d374b496 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -5018,9 +5018,18 @@ mlir::Operation *Fortran::lower::genOpenACCLoopFromDoConstruct(
   // TODO: There may be other strategies that can be employed such
   // as generating acc.private for the loop variables without attaching
   // them to acc.loop.
-  if (eval.lowerAsUnstructured())
-    TODO(converter.getCurrentLocation(),
-         "unstructured do loops in acc regions");
+  // For now - generate a not-yet-implemented message because without
+  // privatizing the induction variable, the loop may not execute correctly.
+  // Only do this for `acc kernels` because in `acc parallel`, scalars end
+  // up as implicitly firstprivate.
+  if (eval.lowerAsUnstructured()) {
+    if (mlir::isa_and_present<mlir::acc::KernelsOp>(
+            mlir::acc::getEnclosingComputeOp(
+                converter.getFirOpBuilder().getRegion())))
+      TODO(converter.getCurrentLocation(),
+           "unstructured do loop in acc kernels");
+    return nullptr;
+  }
 
   // Open up a new scope for the loop variables.
   localSymbols.pushScope();
diff --git a/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90 b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
new file mode 100644
index 0000000000000..aa1d44365e5eb
--- /dev/null
+++ b/flang/test/Lower/OpenACC/Todo/do-loops-to-acc-loops-todo.f90
@@ -0,0 +1,91 @@
+! RUN: split-file %s %t
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_stop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK1
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/do_loop_with_cycle_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_goto_loop.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK3
+! RUN: %not_todo_cmd bbc -fopenacc -emit-hlfir %t/nested_loop_with_inner_goto.f90 -o - 2>&1 | FileCheck %s --check-prefix=CHECK4
+
+//--- do_loop_with_stop.f90
+
+subroutine do_loop_with_stop()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  !$acc kernels
+  do i = 1, n
+    a(i) = b(i) + 1.0
+    if (i == 5) stop
+  end do
+  !$acc end kernels
+
+! CHECK1: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- do_loop_with_cycle_goto.f90
+
+subroutine do_loop_with_cycle_goto()
+  integer :: i
+  integer, parameter :: n = 10
+  real, dimension(n) :: a, b
+
+  ! Do loop with cycle and goto - expected to hit the unstructured-loop TODO.
+  !$acc kernels
+  do i = 1, n
+    if (i == 3) cycle
+    a(i) = b(i) + 1.0
+    if (i == 7) goto 200
+    a(i) = a(i) * 2.0
+  end do
+200 continue
+  !$acc end kernels
+
+! CHECK2: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_goto_loop.f90
+
+subroutine nested_goto_loop()
+  integer :: i, j
+  integer, parameter :: n = 10, m = 5
+  real, dimension(n,m) :: a, b
+
+  ! Nested loop with goto from inner to outer - expected to hit the unstructured-loop TODO.
+  !$acc kernels
+  do i = 1, n
+    do j = 1, m
+      a(i,j) = b(i,j) + 1.0
+      if (i * j > 20) goto 300  ! Exit both loops
+    end do
+  end do
+300 continue
+  !$acc end kernels
+
+! CHECK3: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
+
+//--- nested_loop_with_inner_goto.f90
+
+subroutine nested_loop_with_inner_goto()
+  integer :: ii = 0, jj = 0
+  integer, parameter :: nn = 3
+  real, dimension(nn, nn) :: aa
+  
+  aa = -1
+  
+  ! Nested loop with goto from inner loop - expected to hit the unstructured-loop TODO.
+  !$acc kernels
+  do ii = 1, nn
+    do jj = 1, nn
+      if (jj > 1) goto 300
+      aa(jj, ii) = 1337
+    end do
+    300 continue
+  end do
+  !$acc end kernels
+
+! CHECK4: not yet implemented: unstructured do loop in acc kernels
+
+end subroutine
\ No newline at end of file
diff --git a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90 b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
index d34cd11795b0f..5f8ea03d43a5d 100644
--- a/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
+++ b/flang/test/Lower/OpenACC/do-loops-to-acc-loops.f90
@@ -245,71 +245,6 @@ subroutine infinite_loop_no_iv()
 
 end subroutine
 
-! CHECK-LABEL: func.func @_QPdo_loop_with_goto
-subroutine do_loop_with_goto()
-  integer :: i
-  integer, parameter :: n = 10
-  real, dimension(n) :: a, b
-
-  ! Do loop with goto - unstructured control flow is not converted.
-  !$acc kernels
-  do i = 1, n
-    a(i) = b(i) + 1.0
-    if (i == 5) goto 100
-    100 continue
-  end do
-  !$acc end kernels
-
-! CHECK: acc.kernels {
-! CHECK-NOT: acc.loop
-
-end subroutine
-
-
-! CHECK-LABEL: func.func @_QPdo_loop_with_cycle_goto
-subroutine do_loop_with_cycle_goto()
-  integer :: i
-  integer, parameter :: n = 10
-  real, dimension(n) :: a, b
-
-  ! Do loop with cycle and goto - unstructured control flow is not converted.
-  !$acc kernels
-  do i = 1, n
-    if (i == 3) cycle
-    a(i) = b(i) + 1.0
-    if (i == 7) goto 200
-    a(i) = a(i) * 2.0
-  end do
-200 continue
-  !$acc end kernels
-
-! CHECK: acc.kernels {
-! CHECK-NOT: acc.loop
-
-end subroutine
-
-! CHECK-LABEL: func.func @_QPnested_goto_loop
-subroutine nested_goto_loop()
-  integer :: i, j
-  integer, parameter :: n = 10, m = 5
-  real, dimension(n,m) :: a, b
-
-  ! Nested loop with goto from inner to outer - should NOT convert to acc.loop
-  !$acc kernels
-  do i = 1, n
-    do j = 1, m
-      a(i,j) = b(i,j) + 1.0
-      if (i * j > 20) goto 300  ! Exit both loops
-    end do
-  end do
-300 continue
-  !$acc end kernels
-
-! CHECK: acc.kernels {
-! CHECK-NOT: acc.loop
-
-end subroutine
-
 ! CHECK-LABEL: func.func @_QPwhile_like_loop
 subroutine while_like_loop()
   integer :: i

From f0fb4de0bf4d91f585b853b48ed24c3ad1e25aa0 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru@nvidia.com>
Date: Fri, 25 Jul 2025 13:27:56 -0700
Subject: [PATCH 5/5] Add helper for loop to get par mode

---
 mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td |  5 +++++
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp         | 17 +++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 19b81267c32dd..e1e99c3e445a5 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2391,6 +2391,11 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     // Return whether this LoopOp has a gang, worker, or vector applying to the
     // 'default'/None device-type.
     bool hasDefaultGangWorkerVector();
+
+    // Used to obtain the parallelism mode for the requested device type.
+    // A mode set specifically for the requested device_type takes precedence;
+    // otherwise the mode that is not tied to any device_type is returned.
+    LoopParMode getDefaultOrDeviceTypeParallelism(DeviceType);
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index e73bdd3e11621..9d5dfc1300909 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2957,6 +2957,23 @@ bool acc::LoopOp::hasDefaultGangWorkerVector() {
          getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static);
 }
 
+acc::LoopParMode
+acc::LoopOp::getDefaultOrDeviceTypeParallelism(DeviceType deviceType) {
+  if (hasSeq(deviceType)) // Markers for the requested device_type go first.
+    return LoopParMode::loop_seq;
+  if (hasAuto(deviceType))
+    return LoopParMode::loop_auto;
+  if (hasIndependent(deviceType))
+    return LoopParMode::loop_independent;
+  if (hasSeq()) // Otherwise fall back to the no-argument (default) queries.
+    return LoopParMode::loop_seq;
+  if (hasAuto())
+    return LoopParMode::loop_auto;
+  assert(hasIndependent() &&
+         "loop must have default auto, seq, or independent");
+  return LoopParMode::loop_independent; // Not re-checked in NDEBUG builds.
+}
+
 void acc::LoopOp::addGangOperands(
     MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
     llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {