Skip to content

Commit 18e0142

Browse files
[AArch64] Allow forcing unrolling of small loops
- Introduce the -aarch64-force-unroll-threshold option; when a loop’s cost is below this value we set UP.Force = true (default 0 keeps current behaviour) - Add an AArch64 loop-unroll regression test that runs once at the default threshold and once with the flag raised, confirming forced unrolling
1 parent 300750d commit 18e0142

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
7777
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
7878
cl::desc("The number of instructions to search for a redundant dmb"));
7979

80+
// Cost threshold (in TCK_SizeAndLatency units) below which a small loop is
// forcibly unrolled by setting UP.Force in getUnrollingPreferences. The
// default of 0 keeps the current behaviour, since a loop's cost is never
// below zero. Declared unsigned for consistency with the neighbouring
// options (e.g. DMBLookaheadThreshold): a negative threshold would be
// meaningless, and cl::opt<unsigned> rejects negative values at parse time.
static cl::opt<unsigned> Aarch64ForceUnrollThreshold(
    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83+
8084
namespace {
8185
class TailFoldingOption {
8286
// These bitfields will only ever be set to something non-zero in operator=,
@@ -5250,6 +5254,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
52505254
// inlining. Don't unroll auto-vectorized loops either, though do allow
52515255
// unrolling of the scalar remainder.
52525256
bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5257+
InstructionCost Cost = 0;
52535258
for (auto *BB : L->getBlocks()) {
52545259
for (auto &I : *BB) {
52555260
// Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5269,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
52645269
continue;
52655270
return;
52665271
}
5272+
5273+
SmallVector<const Value *, 4> Operands(I.operand_values());
5274+
Cost += getInstructionCost(&I, Operands,
5275+
TargetTransformInfo::TCK_SizeAndLatency);
52675276
}
52685277
}
52695278

@@ -5310,6 +5319,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
53105319
UP.UnrollAndJam = true;
53115320
UP.UnrollAndJamInnerLoopThreshold = 60;
53125321
}
5322+
5323+
// Forcing unrolling of small loops can be very useful because of the
5324+
// branch-taken cost of the backedge.
5325+
if (Cost < Aarch64ForceUnrollThreshold)
5326+
UP.Force = true;
53135327
}
53145328

53155329
void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
2+
; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
3+
4+
; The loop has a small runtime upper bound (at most four iterations) but a
5+
; relatively expensive body. With runtime unrolling enabled, the cost model
6+
; still leaves the loop rolled. Raising the AArch64 force threshold overrides
7+
; that decision and forces the loop to be unrolled.
8+
9+
target triple = "aarch64-unknown-linux-gnu"
10+
11+
define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
12+
entry:
13+
br label %loop
14+
15+
; NOFORCE-LABEL: @force_small_loop(
16+
; NOFORCE: loop:
17+
; NOFORCE: br i1 %cond, label %body, label %exit
18+
; NOFORCE: body:
19+
; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
20+
; NOFORCE: latch:
21+
; NOFORCE: br i1 %cmp2, label %loop, label %exit
22+
; NOFORCE: ret void
23+
; NOFORCE-NOT: loop.1:
24+
;
25+
; FORCE-LABEL: @force_small_loop(
26+
; FORCE: loop:
27+
; FORCE: br i1 %cond, label %body, label %exit
28+
; FORCE: loop.1:
29+
; FORCE: br i1 true, label %body.1, label %exit
30+
; FORCE: body.1:
31+
; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
32+
; FORCE: latch.1:
33+
; FORCE: br i1 %cmp2.1, label %loop, label %exit
34+
; FORCE: ret void
35+
36+
; Loop header: a long chain of dependent arithmetic (%tmp0..%tmp14) makes the
; body expensive enough that the default cost model leaves the loop rolled.
loop:
37+
%i = phi i32 [ 0, %entry ], [ %inc, %latch ]
38+
%ptra = getelementptr inbounds i32, ptr %a, i32 %i
39+
%pa = load i32, ptr %ptra, align 4
40+
%tmp0 = mul nsw i32 %pa, %pa
41+
%tmp1 = add nsw i32 %tmp0, %pa
42+
%tmp2 = shl i32 %tmp1, 1
43+
%tmp3 = ashr i32 %tmp2, 1
44+
%tmp4 = xor i32 %tmp3, %pa
45+
%tmp5 = add nsw i32 %tmp4, 7
46+
%tmp6 = mul nsw i32 %tmp5, 5
47+
%tmp7 = add nsw i32 %tmp6, %tmp4
48+
%tmp8 = mul nsw i32 %tmp7, %tmp3
49+
%tmp9 = add nsw i32 %tmp8, %tmp7
50+
%tmp10 = xor i32 %tmp9, %tmp6
51+
%tmp11 = add nsw i32 %tmp10, %tmp8
52+
%tmp12 = mul nsw i32 %tmp11, 9
53+
%tmp13 = add nsw i32 %tmp12, %tmp10
54+
%tmp14 = xor i32 %tmp13, %tmp11
55+
; Early exit once %i reaches %n; otherwise fall through to the store block.
%cond = icmp ult i32 %i, %n
56+
br i1 %cond, label %body, label %exit
57+
58+
body:
59+
%ptrb = getelementptr inbounds i32, ptr %b, i32 %i
60+
%pb = load i32, ptr %ptrb, align 4
61+
%sum = add nsw i32 %pb, %tmp14
62+
%diff = sub nsw i32 %sum, %pa
63+
%mix1 = mul nsw i32 %diff, 3
64+
%mix2 = add nsw i32 %mix1, %tmp3
65+
%mix3 = xor i32 %mix2, %diff
66+
%mix4 = add nsw i32 %mix3, %tmp0
67+
%mix5 = mul nsw i32 %mix4, 11
68+
%mix6 = add nsw i32 %mix5, %mix2
69+
%mix7 = xor i32 %mix6, %mix5
70+
%mix8 = add nsw i32 %mix7, %mix3
71+
%mix9 = mul nsw i32 %mix8, 13
72+
%mix10 = add nsw i32 %mix9, %mix8
73+
%mix11 = xor i32 %mix10, %mix7
74+
%mix12 = add nsw i32 %mix11, %mix6
75+
%mix13 = mul nsw i32 %mix12, 17
76+
%mix14 = add nsw i32 %mix13, %mix9
77+
%mix15 = xor i32 %mix14, %mix10
78+
store i32 %mix15, ptr %ptrb, align 4
79+
br label %latch
80+
81+
; Latch: the backedge count is clamped to min(%n, 4), giving the small
; runtime trip count (at most four iterations) the test relies on.
latch:
82+
%inc = add nuw nsw i32 %i, 1
83+
%cmp.limit = icmp ult i32 %n, 4
84+
%upper = select i1 %cmp.limit, i32 %n, i32 4
85+
%cmp2 = icmp ult i32 %inc, %upper
86+
br i1 %cmp2, label %loop, label %exit
87+
88+
exit:
89+
ret void
90+
}

0 commit comments

Comments
 (0)