Skip to content

Commit 18e0142

Browse files
[AArch64] Allow forcing unrolling of small loops
- Introduce the -aarch64-force-unroll-threshold option; when a loop’s cost is below this value we set UP.Force = true (default 0 keeps current behaviour) - Add an AArch64 loop-unroll regression test that runs once at the default threshold and once with the flag raised, confirming forced unrolling
1 parent 300750d commit 18e0142

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
7777
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
7878
cl::desc("The number of instructions to search for a redundant dmb"));
7979

80+
// Cost threshold (in TCK_SizeAndLatency units) below which a small loop is
// forcibly unrolled by setting UP.Force in getUnrollingPreferences. The
// default of 0 keeps the current behaviour, since a loop's cost is never
// below zero. Declared unsigned for consistency with the neighbouring
// options (e.g. DMBLookaheadThreshold): a negative threshold would be
// meaningless, and cl::opt<unsigned> rejects negative values at parse time.
static cl::opt<unsigned> Aarch64ForceUnrollThreshold(
    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83+
8084
namespace {
8185
class TailFoldingOption {
8286
// These bitfields will only ever be set to something non-zero in operator=,
@@ -5250,6 +5254,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
52505254
// inlining. Don't unroll auto-vectorized loops either, though do allow
52515255
// unrolling of the scalar remainder.
52525256
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5257+
InstructionCost Cost = 0;
52535258
for (auto *BB : L->getBlocks()) {
52545259
for (auto &I : *BB) {
52555260
// Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5269,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
52645269
continue;
52655270
return;
52665271
}
5272+
5273+
SmallVector<const Value *, 4> Operands(I.operand_values());
5274+
Cost += getInstructionCost(&I, Operands,
5275+
TargetTransformInfo::TCK_SizeAndLatency);
52675276
}
52685277
}
52695278

@@ -5310,6 +5319,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
53105319
UP.UnrollAndJam = true;
53115320
UP.UnrollAndJamInnerLoopThreshold = 60;
53125321
}
5322+
5323+
// Forcing unrolling of small loops can be very useful because of the
5324+
// branch-taken cost of the backedge.
5325+
if (Cost < Aarch64ForceUnrollThreshold)
5326+
UP.Force = true;
53135327
}
53145328

53155329
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
2+
; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
3+
4+
; The loop has a small runtime upper bound (at most four iterations) but a
5+
; relatively expensive body. With runtime unrolling enabled, the cost model
6+
; still leaves the loop rolled. Raising the AArch64 force threshold overrides
7+
; that decision and forces the loop to be unrolled.
8+
9+
target triple = "aarch64-unknown-linux-gnu"
10+
11+
define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
12+
entry:
13+
br label %loop
14+
15+
; NOFORCE-LABEL: @force_small_loop(
16+
; NOFORCE: loop:
17+
; NOFORCE: br i1 %cond, label %body, label %exit
18+
; NOFORCE: body:
19+
; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
20+
; NOFORCE: latch:
21+
; NOFORCE: br i1 %cmp2, label %loop, label %exit
22+
; NOFORCE: ret void
23+
; NOFORCE-NOT: loop.1:
24+
;
25+
; FORCE-LABEL: @force_small_loop(
26+
; FORCE: loop:
27+
; FORCE: br i1 %cond, label %body, label %exit
28+
; FORCE: loop.1:
29+
; FORCE: br i1 true, label %body.1, label %exit
30+
; FORCE: body.1:
31+
; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
32+
; FORCE: latch.1:
33+
; FORCE: br i1 %cmp2.1, label %loop, label %exit
34+
; FORCE: ret void
35+
36+
; Loop header: a long chain of dependent arithmetic (%tmp0..%tmp14) makes the
; body expensive enough that the default cost model leaves the loop rolled.
loop:
37+
%i = phi i32 [ 0, %entry ], [ %inc, %latch ]
38+
%ptra = getelementptr inbounds i32, ptr %a, i32 %i
39+
%pa = load i32, ptr %ptra, align 4
40+
%tmp0 = mul nsw i32 %pa, %pa
41+
%tmp1 = add nsw i32 %tmp0, %pa
42+
%tmp2 = shl i32 %tmp1, 1
43+
%tmp3 = ashr i32 %tmp2, 1
44+
%tmp4 = xor i32 %tmp3, %pa
45+
%tmp5 = add nsw i32 %tmp4, 7
46+
%tmp6 = mul nsw i32 %tmp5, 5
47+
%tmp7 = add nsw i32 %tmp6, %tmp4
48+
%tmp8 = mul nsw i32 %tmp7, %tmp3
49+
%tmp9 = add nsw i32 %tmp8, %tmp7
50+
%tmp10 = xor i32 %tmp9, %tmp6
51+
%tmp11 = add nsw i32 %tmp10, %tmp8
52+
%tmp12 = mul nsw i32 %tmp11, 9
53+
%tmp13 = add nsw i32 %tmp12, %tmp10
54+
%tmp14 = xor i32 %tmp13, %tmp11
55+
; Early exit once %i reaches %n; otherwise fall through to the store block.
%cond = icmp ult i32 %i, %n
56+
br i1 %cond, label %body, label %exit
57+
58+
body:
59+
%ptrb = getelementptr inbounds i32, ptr %b, i32 %i
60+
%pb = load i32, ptr %ptrb, align 4
61+
%sum = add nsw i32 %pb, %tmp14
62+
%diff = sub nsw i32 %sum, %pa
63+
%mix1 = mul nsw i32 %diff, 3
64+
%mix2 = add nsw i32 %mix1, %tmp3
65+
%mix3 = xor i32 %mix2, %diff
66+
%mix4 = add nsw i32 %mix3, %tmp0
67+
%mix5 = mul nsw i32 %mix4, 11
68+
%mix6 = add nsw i32 %mix5, %mix2
69+
%mix7 = xor i32 %mix6, %mix5
70+
%mix8 = add nsw i32 %mix7, %mix3
71+
%mix9 = mul nsw i32 %mix8, 13
72+
%mix10 = add nsw i32 %mix9, %mix8
73+
%mix11 = xor i32 %mix10, %mix7
74+
%mix12 = add nsw i32 %mix11, %mix6
75+
%mix13 = mul nsw i32 %mix12, 17
76+
%mix14 = add nsw i32 %mix13, %mix9
77+
%mix15 = xor i32 %mix14, %mix10
78+
store i32 %mix15, ptr %ptrb, align 4
79+
br label %latch
80+
81+
; Latch: the backedge count is clamped to min(%n, 4), giving the small
; runtime trip count (at most four iterations) the test relies on.
latch:
82+
%inc = add nuw nsw i32 %i, 1
83+
%cmp.limit = icmp ult i32 %n, 4
84+
%upper = select i1 %cmp.limit, i32 %n, i32 4
85+
%cmp2 = icmp ult i32 %inc, %upper
86+
br i1 %cmp2, label %loop, label %exit
87+
88+
exit:
89+
ret void
90+
}

0 commit comments

Comments
 (0)