; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE

; The loop has a small runtime upper bound (at most four iterations) but a
; relatively expensive body. With runtime unrolling enabled, the default cost
; model still leaves the loop rolled. Raising the AArch64 force threshold
; overrides that decision and the loop is unrolled.
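;
; Rough C-level sketch of the loop shape (illustrative only, not the exact
; source; the arithmetic chains are elided):
;   unsigned i = 0;
;   do {
;     int t = /* long arithmetic chain over a[i] */;
;     if (i >= n) break;
;     b[i] = /* long arithmetic chain over b[i] and t */;
;     i++;
;   } while (i < (n < 4 ? n : 4));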

target triple = "aarch64-unknown-linux-gnu"

define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
entry:
  br label %loop

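; With the default threshold the loop must stay rolled: a single
; loop/body/latch sequence and no cloned ".1" blocks.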
; NOFORCE-LABEL: @force_small_loop(
; NOFORCE: loop:
; NOFORCE: br i1 %cond, label %body, label %exit
; NOFORCE: body:
; NOFORCE: store i32 %mix15, ptr %ptrb, align 4
; NOFORCE: latch:
; NOFORCE: br i1 %cmp2, label %loop, label %exit
; NOFORCE-NOT: loop.1:
; NOFORCE: ret void
;
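; With the threshold forced up, the loop is unrolled and the cloned ".1"
; blocks appear before the exit.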
; FORCE-LABEL: @force_small_loop(
; FORCE: loop:
; FORCE: br i1 %cond, label %body, label %exit
; FORCE: loop.1:
; FORCE: br i1 true, label %body.1, label %exit
; FORCE: body.1:
; FORCE: store i32 %mix15.1, ptr %ptrb.1, align 4
; FORCE: latch.1:
; FORCE: br i1 %cmp2.1, label %loop, label %exit
; FORCE: ret void

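; Loop header: loads a[i] and feeds it through a long chain of integer ops
; (this keeps the per-iteration cost high), then guards the store block on
; i < n.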
loop:
  %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
  %ptra = getelementptr inbounds i32, ptr %a, i32 %i
  %pa = load i32, ptr %ptra, align 4
  %tmp0 = mul nsw i32 %pa, %pa
  %tmp1 = add nsw i32 %tmp0, %pa
  %tmp2 = shl i32 %tmp1, 1
  %tmp3 = ashr i32 %tmp2, 1
  %tmp4 = xor i32 %tmp3, %pa
  %tmp5 = add nsw i32 %tmp4, 7
  %tmp6 = mul nsw i32 %tmp5, 5
  %tmp7 = add nsw i32 %tmp6, %tmp4
  %tmp8 = mul nsw i32 %tmp7, %tmp3
  %tmp9 = add nsw i32 %tmp8, %tmp7
  %tmp10 = xor i32 %tmp9, %tmp6
  %tmp11 = add nsw i32 %tmp10, %tmp8
  %tmp12 = mul nsw i32 %tmp11, 9
  %tmp13 = add nsw i32 %tmp12, %tmp10
  %tmp14 = xor i32 %tmp13, %tmp11
  %cond = icmp ult i32 %i, %n
  br i1 %cond, label %body, label %exit

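; Conditionally executed block: loads b[i], mixes it with the header's
; result, and stores it back.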
body:
  %ptrb = getelementptr inbounds i32, ptr %b, i32 %i
  %pb = load i32, ptr %ptrb, align 4
  %sum = add nsw i32 %pb, %tmp14
  %diff = sub nsw i32 %sum, %pa
  %mix1 = mul nsw i32 %diff, 3
  %mix2 = add nsw i32 %mix1, %tmp3
  %mix3 = xor i32 %mix2, %diff
  %mix4 = add nsw i32 %mix3, %tmp0
  %mix5 = mul nsw i32 %mix4, 11
  %mix6 = add nsw i32 %mix5, %mix2
  %mix7 = xor i32 %mix6, %mix5
  %mix8 = add nsw i32 %mix7, %mix3
  %mix9 = mul nsw i32 %mix8, 13
  %mix10 = add nsw i32 %mix9, %mix8
  %mix11 = xor i32 %mix10, %mix7
  %mix12 = add nsw i32 %mix11, %mix6
  %mix13 = mul nsw i32 %mix12, 17
  %mix14 = add nsw i32 %mix13, %mix9
  %mix15 = xor i32 %mix14, %mix10
  store i32 %mix15, ptr %ptrb, align 4
  br label %latch

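; Latch: increments i and compares against min(n, 4), so the trip count has a
; compile-time-visible upper bound of four iterations.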
latch:
  %inc = add nuw nsw i32 %i, 1
  %cmp.limit = icmp ult i32 %n, 4
  %upper = select i1 %cmp.limit, i32 %n, i32 4
  %cmp2 = icmp ult i32 %inc, %upper
  br i1 %cmp2, label %loop, label %exit

exit:
  ret void
}