diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 31f1197b9723b..da829046cc421 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -700,18 +700,19 @@ class CombinerHelper { /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant, /// return an expression that implements it by multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const; + MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const; /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant. - bool matchUDivorURemByConst(MachineInstr &MI) const; - void applyUDivorURemByConst(MachineInstr &MI) const; - - /// Given an G_SDIV \p MI expressing a signed divide by constant, return an - /// expression that implements it by multiplying by a magic number. - /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildSDivUsingMul(MachineInstr &MI) const; - /// Combine G_SDIV by constant into a multiply by magic constant. - bool matchSDivByConst(MachineInstr &MI) const; - void applySDivByConst(MachineInstr &MI) const; + bool matchUDivOrURemByConst(MachineInstr &MI) const; + void applyUDivOrURemByConst(MachineInstr &MI) const; + + /// Given a G_SDIV \p MI or G_SREM \p MI expressing a signed divide or + /// remainder by constant, return an expression that implements it by + /// multiplying by a magic number. Ref: "Hacker's Delight" or "The PowerPC + /// Compiler Writer's Guide". + MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const; + /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant. 
+ bool matchSDivOrSRemByConst(MachineInstr &MI) const; + void applySDivOrSRemByConst(MachineInstr &MI) const; /// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant, /// return expressions that implements it by shifting. diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 66051d756c808..fc81ab76dc72d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg, def udiv_by_const : GICombineRule< (defs root:$root), (match (G_UDIV $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; def sdiv_by_const : GICombineRule< (defs root:$root), (match (G_SDIV $dst, $x, $y):$root, - [{ return Helper.matchSDivByConst(*${root}); }]), - (apply [{ Helper.applySDivByConst(*${root}); }])>; + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; def sdiv_by_pow2 : GICombineRule< (defs root:$root), @@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2, def urem_by_const : GICombineRule< (defs root:$root), (match (G_UREM $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; -def intrem_combines : GICombineGroup<[urem_by_const]>; +def srem_by_const : GICombineRule< + (defs root:$root), + (match (G_SREM $dst, $x, $y):$root, + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; + +def intrem_combines : 
GICombineGroup<[urem_by_const, srem_by_const]>; def reassoc_ptradd : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3922eba55e195..e8f513ad5a7a9 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI, return false; } -MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { +MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); auto &UDivorRem = cast<GenericMachineInstr>(MI); @@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { return ret; } -bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { +bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); Register Dst = MI.getOperand(0).getReg(); @@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const { - auto *NewMI = buildUDivorURemUsingMul(MI); +void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const { + auto *NewMI = buildUDivOrURemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); +bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM); Register Dst = 
MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); @@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { return false; // If the sdiv has an 'exact' flag we can use a simpler lowering. - if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + if (Opcode == TargetOpcode::G_SDIV && + MI.getFlag(MachineInstr::MIFlag::IsExact)) { return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } @@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) && !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}})) return false; + if (Opcode == TargetOpcode::G_SREM && + !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}})) + return false; } return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applySDivByConst(MachineInstr &MI) const { - auto *NewMI = buildSDivUsingMul(MI); +void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const { + auto *NewMI = buildSDivOrSRemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); - auto &SDiv = cast<GenericMachineInstr>(MI); - Register Dst = SDiv.getReg(0); - Register LHS = SDiv.getReg(1); - Register RHS = SDiv.getReg(2); +MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_SDIV || + Opcode == TargetOpcode::G_SREM); + auto &SDivorRem = cast<GenericMachineInstr>(MI); + Register Dst = SDivorRem.getReg(0); + Register LHS = SDivorRem.getReg(1); + Register RHS = SDivorRem.getReg(2); LLT Ty = MRI.getType(Dst); LLT ScalarTy = Ty.getScalarType(); const unsigned EltBits = ScalarTy.getScalarSizeInBits(); @@ -5705,7 +5712,13 @@ 
MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); auto T = MIB.buildLShr(Ty, Q, SignShift); T = MIB.buildAnd(Ty, T, ShiftMask); - return MIB.buildAdd(Ty, Q, T); + auto ret = MIB.buildAdd(Ty, Q, T); + + if (Opcode == TargetOpcode::G_SREM) { + auto Prod = MIB.buildMul(Ty, ret, RHS); + return MIB.buildSub(Ty, LHS, Prod); + } + return ret; } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1376f5d9a380d..b124042265d40 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: mov w9, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #-109 // =0xffffff93 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: add w8, w0, w8, asr #8 +; CHECK-GI-NEXT: sbfx w8, w8, #2, #6 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov w9, #41 // =0x29 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: asr w8, w8, #4 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 -; CHECK-GI-NEXT: mov w9, 
#7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #18725 // =0x4925 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #1 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: mov w9, #5243 // =0x147b +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #3 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w0, w8 +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: movk w8, #37449, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #2 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w9, w0, w8 -; CHECK-GI-NEXT: msub w0, w9, w8, w0 +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: asr w8, w8, #5 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: %s = srem i32 %a, 100 
@@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv x8, x0, x8 +; CHECK-GI-NEXT: mov x8, #18725 // =0x4925 +; CHECK-GI-NEXT: movk x8, #9362, lsl #16 +; CHECK-GI-NEXT: movk x8, #37449, lsl #32 +; CHECK-GI-NEXT: movk x8, #18724, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: asr x8, x8, #1 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 ; CHECK-GI-NEXT: lsl x9, x8, #3 ; CHECK-GI-NEXT: sub x8, x9, x8 ; CHECK-GI-NEXT: sub x0, x0, x8 @@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv x9, x0, x8 -; CHECK-GI-NEXT: msub x0, x9, x8, x0 +; CHECK-GI-NEXT: mov x8, #55051 // =0xd70b +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk x8, #28835, lsl #16 +; CHECK-GI-NEXT: movk x8, #2621, lsl #32 +; CHECK-GI-NEXT: movk x8, #41943, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: add x8, x8, x0 +; CHECK-GI-NEXT: asr x8, x8, #6 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 +; CHECK-GI-NEXT: msub x0, x8, x9, x0 ; CHECK-GI-NEXT: ret entry: %s = srem i64 %a, 100 @@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #65427 // =0xff93 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24 +; 
CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, @@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: 
sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, @@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v3.4h, #7 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 
-; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov w8, #147 // =0x93 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, @@ -943,30 +1037,37 @@ define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_100: ; CHECK-GI: // 
%bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 ; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v3.4h, #100 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; 
CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, @@ -988,42 +1089,15 @@ define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: sv8i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v4.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #147 +; CHECK-GI-NEXT: movi v3.8b, #7 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: add v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1044,42 +1118,14 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: 
sv8i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #41 +; CHECK-GI-NEXT: movi v3.8b, #100 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1102,72 +1148,16 @@ define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; 
CHECK-GI-NEXT: movi v16.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov 
v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: movi v1.16b, #147 +; CHECK-GI-NEXT: movi v3.16b, #7 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1189,72 +1179,15 @@ define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v16.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv 
w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: movi v1.16b, #41 +; CHECK-GI-NEXT: movi v3.16b, #100 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1754,20 +1687,31 @@ define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov w8, #18725 // =0x4925 +; CHECK-GI-NEXT: shl v2.2s, 
v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #1 // =0x1 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1792,20 +1736,31 @@ define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov w8, #5243 // =0x147b +; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #3 // =0x3 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; 
CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1949,24 +1904,15 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v2.4h, #7 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI44_0 +; CHECK-GI-NEXT: movi v3.4h, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: 
ret entry: %s = srem <4 x i16> %d, @@ -1988,24 +1934,15 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v2.4h, #100 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI45_0 +; CHECK-GI-NEXT: movi v3.4h, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -2028,38 +1965,16 @@ define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v4.4h, #7 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; 
CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI46_0 +; CHECK-GI-NEXT: movi v3.8h, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI46_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2082,38 +1997,16 @@ define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.4h, #100 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov 
v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI47_0 +; CHECK-GI-NEXT: movi v3.8h, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI47_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2499,17 +2392,16 @@ define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI56_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2532,17 +2424,15 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 
def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI57_0 +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2664,21 +2554,17 @@ define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI60_0 +; CHECK-GI-NEXT: movi v3.4s, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI60_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #2 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #2 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, @@ -2702,21 +2588,16 @@ 
define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI61_0 +; CHECK-GI-NEXT: movi v3.4s, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI61_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 530f4cf53321e..1eb8457cd4a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -254,27 +254,13 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; 
CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 11, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 @@ -327,42 +313,21 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001 +; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 11, v3 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 11, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result @@ -372,27 +337,14 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; 
CHECK-NEXT: v_mul_lo_u32 v2, v2, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 20, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 1235195 @@ -445,42 +397,22 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, 
v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441 +; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CGP-NEXT: v_mul_hi_i32 v4, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 20, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 20, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v4 +; CGP-NEXT: v_lshrrev_b32_e32 v6, 31, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result