-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AArch64] Allow forcing unrolling of small loops #167488
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AArch64] Allow forcing unrolling of small loops #167488
Conversation
VladiKrapp-Arm
commented
Nov 11, 2025
- Introduce the -aarch64-force-unroll-threshold option; when a loop’s cost is below this value, we set UP.Force = true (the default of 0 keeps the current behaviour)
- Add an AArch64 loop-unroll regression test that runs once at the default threshold and once with the flag raised, confirming forced unrolling
|
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-aarch64 Author: Vladi Krapp (VladiKrapp-Arm) Changes
Full diff: https://github.com/llvm/llvm-project/pull/167488.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 197aae6e03cb1..79ad532f73efc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));
+static cl::opt<int> Aarch64ForceUnrollThreshold(
+ "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -5250,6 +5254,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
// inlining. Don't unroll auto-vectorized loops either, though do allow
// unrolling of the scalar remainder.
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+ InstructionCost Cost = 0;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
// Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5269,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
continue;
return;
}
+
+ SmallVector<const Value*, 4> Operands(I.operand_values());
+ Cost += getInstructionCost(&I, Operands,
+ TargetTransformInfo::TCK_SizeAndLatency);
}
}
@@ -5310,6 +5319,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
UP.UnrollAndJam = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
}
+
+ // Force unrolling small loops can be very useful because of the branch
+ // taken cost of the backedge.
+ if (Cost < Aarch64ForceUnrollThreshold)
+ UP.Force = true;
}
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
new file mode 100644
index 0000000000000..986df8bed8462
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
+; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
+
+; The loop has a small runtime upper bound (at most four iterations) but a
+; relatively expensive body. With runtime unrolling enabled, the cost model
+; still leaves the loop rolled. Raising the AArch64 force threshold overrides
+; that decision and unrolls.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
+entry:
+ br label %loop
+
+; NOFORCE-LABEL: @force_small_loop(
+; NOFORCE: loop:
+; NOFORCE: br i1 %cond, label %body, label %exit
+; NOFORCE: body:
+; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
+; NOFORCE: latch:
+; NOFORCE: br i1 %cmp2, label %loop, label %exit
+; NOFORCE: ret void
+; NOFORCE-NOT: loop.1:
+;
+; FORCE-LABEL: @force_small_loop(
+; FORCE: loop:
+; FORCE: br i1 %cond, label %body, label %exit
+; FORCE: loop.1:
+; FORCE: br i1 true, label %body.1, label %exit
+; FORCE: body.1:
+; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
+; FORCE: latch.1:
+; FORCE: br i1 %cmp2.1, label %loop, label %exit
+; FORCE: ret void
+
+loop:
+ %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
+ %ptra = getelementptr inbounds i32, ptr %a, i32 %i
+ %pa = load i32, ptr %ptra, align 4
+ %tmp0 = mul nsw i32 %pa, %pa
+ %tmp1 = add nsw i32 %tmp0, %pa
+ %tmp2 = shl i32 %tmp1, 1
+ %tmp3 = ashr i32 %tmp2, 1
+ %tmp4 = xor i32 %tmp3, %pa
+ %tmp5 = add nsw i32 %tmp4, 7
+ %tmp6 = mul nsw i32 %tmp5, 5
+ %tmp7 = add nsw i32 %tmp6, %tmp4
+ %tmp8 = mul nsw i32 %tmp7, %tmp3
+ %tmp9 = add nsw i32 %tmp8, %tmp7
+ %tmp10 = xor i32 %tmp9, %tmp6
+ %tmp11 = add nsw i32 %tmp10, %tmp8
+ %tmp12 = mul nsw i32 %tmp11, 9
+ %tmp13 = add nsw i32 %tmp12, %tmp10
+ %tmp14 = xor i32 %tmp13, %tmp11
+ %cond = icmp ult i32 %i, %n
+ br i1 %cond, label %body, label %exit
+
+body:
+ %ptrb = getelementptr inbounds i32, ptr %b, i32 %i
+ %pb = load i32, ptr %ptrb, align 4
+ %sum = add nsw i32 %pb, %tmp14
+ %diff = sub nsw i32 %sum, %pa
+ %mix1 = mul nsw i32 %diff, 3
+ %mix2 = add nsw i32 %mix1, %tmp3
+ %mix3 = xor i32 %mix2, %diff
+ %mix4 = add nsw i32 %mix3, %tmp0
+ %mix5 = mul nsw i32 %mix4, 11
+ %mix6 = add nsw i32 %mix5, %mix2
+ %mix7 = xor i32 %mix6, %mix5
+ %mix8 = add nsw i32 %mix7, %mix3
+ %mix9 = mul nsw i32 %mix8, 13
+ %mix10 = add nsw i32 %mix9, %mix8
+ %mix11 = xor i32 %mix10, %mix7
+ %mix12 = add nsw i32 %mix11, %mix6
+ %mix13 = mul nsw i32 %mix12, 17
+ %mix14 = add nsw i32 %mix13, %mix9
+ %mix15 = xor i32 %mix14, %mix10
+ store i32 %mix15, ptr %ptrb, align 4
+ br label %latch
+
+latch:
+ %inc = add nuw nsw i32 %i, 1
+ %cmp.limit = icmp ult i32 %n, 4
+ %upper = select i1 %cmp.limit, i32 %n, i32 4
+ %cmp2 = icmp ult i32 %inc, %upper
+ br i1 %cmp2, label %loop, label %exit
+
+exit:
+ ret void
+}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter.
2729997 to
b918db5
Compare
- Introduce the -aarch64-force-unroll-threshold option; when a loop’s cost is below this value we set UP.Force = true (default 0 keeps current behaviour) - Add an AArch64 loop-unroll regression test that runs once at the default threshold and once with the flag raised, confirming forced unrolling
b918db5 to
18e0142
Compare
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be possible to detect small loops which are beneficial for unrolling automatically for a given CPU, instead of just having a very broad flag?
In general, yes, that's a great future direction to follow up on! Preliminary results seem to indicate that loops with pointer chasing benefit more from unrolling than ones without, so there should be some heuristics we can use. There's much more research to be done, though. For now, we can provide a simple, general tool that follows existing practice on AArch32.