-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[GlobalISel] Allow expansion of srem by constant in prelegalizer #148845
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-llvm-globalisel Author: None (jyli0116) Changes: This patch allows srem by a constant to be expanded more efficiently, avoiding the need for expensive sdiv instructions. This is the last of the patches that fix #118090. Patch is 56.43 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/148845.diff 5 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 31f1197b9723b..da829046cc421 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -700,18 +700,19 @@ class CombinerHelper {
/// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant,
/// return an expression that implements it by multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
- MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const;
+ MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const;
/// Combine G_UDIV or G_UREM by constant into a multiply by magic constant.
- bool matchUDivorURemByConst(MachineInstr &MI) const;
- void applyUDivorURemByConst(MachineInstr &MI) const;
-
- /// Given an G_SDIV \p MI expressing a signed divide by constant, return an
- /// expression that implements it by multiplying by a magic number.
- /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
- MachineInstr *buildSDivUsingMul(MachineInstr &MI) const;
- /// Combine G_SDIV by constant into a multiply by magic constant.
- bool matchSDivByConst(MachineInstr &MI) const;
- void applySDivByConst(MachineInstr &MI) const;
+ bool matchUDivOrURemByConst(MachineInstr &MI) const;
+ void applyUDivOrURemByConst(MachineInstr &MI) const;
+
+ /// Given an G_SDIV \p MI or G_SREM \p MI expressing a signed divide by
+ /// constant, return an expression that implements it by multiplying by a
+ /// magic number. Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's
+ /// Guide".
+ MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const;
+ /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant.
+ bool matchSDivOrSRemByConst(MachineInstr &MI) const;
+ void applySDivOrSRemByConst(MachineInstr &MI) const;
/// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant,
/// return expressions that implements it by shifting.
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 66051d756c808..fc81ab76dc72d 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
def udiv_by_const : GICombineRule<
(defs root:$root),
(match (G_UDIV $dst, $x, $y):$root,
- [{ return Helper.matchUDivorURemByConst(*${root}); }]),
- (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+ [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+ (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
def sdiv_by_const : GICombineRule<
(defs root:$root),
(match (G_SDIV $dst, $x, $y):$root,
- [{ return Helper.matchSDivByConst(*${root}); }]),
- (apply [{ Helper.applySDivByConst(*${root}); }])>;
+ [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+ (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
def sdiv_by_pow2 : GICombineRule<
(defs root:$root),
@@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2,
def urem_by_const : GICombineRule<
(defs root:$root),
(match (G_UREM $dst, $x, $y):$root,
- [{ return Helper.matchUDivorURemByConst(*${root}); }]),
- (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>;
+ [{ return Helper.matchUDivOrURemByConst(*${root}); }]),
+ (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>;
-def intrem_combines : GICombineGroup<[urem_by_const]>;
+def srem_by_const : GICombineRule<
+ (defs root:$root),
+ (match (G_SREM $dst, $x, $y):$root,
+ [{ return Helper.matchSDivOrSRemByConst(*${root}); }]),
+ (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>;
+
+def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>;
def reassoc_ptradd : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 3922eba55e195..e8f513ad5a7a9 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
return false;
}
-MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
+MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
auto &UDivorRem = cast<GenericMachineInstr>(MI);
@@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const {
return ret;
}
-bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
+bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM);
Register Dst = MI.getOperand(0).getReg();
@@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const {
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
-void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const {
- auto *NewMI = buildUDivorURemUsingMul(MI);
+void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const {
+ auto *NewMI = buildUDivOrURemUsingMul(MI);
replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
-bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM);
Register Dst = MI.getOperand(0).getReg();
Register RHS = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(Dst);
@@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
return false;
// If the sdiv has an 'exact' flag we can use a simpler lowering.
- if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+ if (Opcode == TargetOpcode::G_SDIV &&
+ MI.getFlag(MachineInstr::MIFlag::IsExact)) {
return matchUnaryPredicate(
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
@@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const {
if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) &&
!isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}}))
return false;
+ if (Opcode == TargetOpcode::G_SREM &&
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}}))
+ return false;
}
return matchUnaryPredicate(
MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
}
-void CombinerHelper::applySDivByConst(MachineInstr &MI) const {
- auto *NewMI = buildSDivUsingMul(MI);
+void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const {
+ auto *NewMI = buildSDivOrSRemUsingMul(MI);
replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
-MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
- assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
- auto &SDiv = cast<GenericMachineInstr>(MI);
- Register Dst = SDiv.getReg(0);
- Register LHS = SDiv.getReg(1);
- Register RHS = SDiv.getReg(2);
+MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ assert(MI.getOpcode() == TargetOpcode::G_SDIV ||
+ Opcode == TargetOpcode::G_SREM);
+ auto &SDivorRem = cast<GenericMachineInstr>(MI);
+ Register Dst = SDivorRem.getReg(0);
+ Register LHS = SDivorRem.getReg(1);
+ Register RHS = SDivorRem.getReg(2);
LLT Ty = MRI.getType(Dst);
LLT ScalarTy = Ty.getScalarType();
const unsigned EltBits = ScalarTy.getScalarSizeInBits();
@@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const {
auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1);
auto T = MIB.buildLShr(Ty, Q, SignShift);
T = MIB.buildAnd(Ty, T, ShiftMask);
- return MIB.buildAdd(Ty, Q, T);
+ auto ret = MIB.buildAdd(Ty, Q, T);
+
+ if (Opcode == TargetOpcode::G_SREM) {
+ auto Prod = MIB.buildMul(Ty, ret, RHS);
+ return MIB.buildSub(Ty, LHS, Prod);
+ }
+ return ret;
}
bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const {
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1376f5d9a380d..b124042265d40 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) {
; CHECK-GI-LABEL: si8_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
-; CHECK-GI-NEXT: mov w9, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w8, w9
+; CHECK-GI-NEXT: mov w9, #-109 // =0xffffff93
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: add w8, w0, w8, asr #8
+; CHECK-GI-NEXT: sbfx w8, w8, #2, #6
+; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) {
; CHECK-GI-LABEL: si8_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
+; CHECK-GI-NEXT: mov w9, #41 // =0x29
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: sxth w8, w8
+; CHECK-GI-NEXT: sbfx w8, w8, #8, #8
+; CHECK-GI-NEXT: asr w8, w8, #4
+; CHECK-GI-NEXT: ubfx w9, w8, #7, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w8, w8, w9
; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
@@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) {
; CHECK-GI-LABEL: si16_7:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
-; CHECK-GI-NEXT: mov w9, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w8, w9
+; CHECK-GI-NEXT: mov w9, #18725 // =0x4925
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: asr w8, w8, #16
+; CHECK-GI-NEXT: asr w8, w8, #1
+; CHECK-GI-NEXT: ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) {
; CHECK-GI-LABEL: si16_100:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
+; CHECK-GI-NEXT: mov w9, #5243 // =0x147b
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: asr w8, w8, #16
+; CHECK-GI-NEXT: asr w8, w8, #3
+; CHECK-GI-NEXT: ubfx w9, w8, #15, #1
+; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: mov w9, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w8, w8, w9
; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
@@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) {
;
; CHECK-GI-LABEL: si32_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: sdiv w8, w0, w8
+; CHECK-GI-NEXT: mov w8, #9363 // =0x2493
+; CHECK-GI-NEXT: movk w8, #37449, lsl #16
+; CHECK-GI-NEXT: smull x8, w0, w8
+; CHECK-GI-NEXT: asr x8, x8, #32
+; CHECK-GI-NEXT: add w8, w8, w0
+; CHECK-GI-NEXT: asr w8, w8, #2
+; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) {
;
; CHECK-GI-LABEL: si32_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: sdiv w9, w0, w8
-; CHECK-GI-NEXT: msub w0, w9, w8, w0
+; CHECK-GI-NEXT: mov w8, #34079 // =0x851f
+; CHECK-GI-NEXT: mov w9, #100 // =0x64
+; CHECK-GI-NEXT: movk w8, #20971, lsl #16
+; CHECK-GI-NEXT: smull x8, w0, w8
+; CHECK-GI-NEXT: asr x8, x8, #32
+; CHECK-GI-NEXT: asr w8, w8, #5
+; CHECK-GI-NEXT: add w8, w8, w8, lsr #31
+; CHECK-GI-NEXT: msub w0, w8, w9, w0
; CHECK-GI-NEXT: ret
entry:
%s = srem i32 %a, 100
@@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) {
;
; CHECK-GI-LABEL: si64_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: sdiv x8, x0, x8
+; CHECK-GI-NEXT: mov x8, #18725 // =0x4925
+; CHECK-GI-NEXT: movk x8, #9362, lsl #16
+; CHECK-GI-NEXT: movk x8, #37449, lsl #32
+; CHECK-GI-NEXT: movk x8, #18724, lsl #48
+; CHECK-GI-NEXT: smulh x8, x0, x8
+; CHECK-GI-NEXT: asr x8, x8, #1
+; CHECK-GI-NEXT: add x8, x8, x8, lsr #63
; CHECK-GI-NEXT: lsl x9, x8, #3
; CHECK-GI-NEXT: sub x8, x9, x8
; CHECK-GI-NEXT: sub x0, x0, x8
@@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) {
;
; CHECK-GI-LABEL: si64_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: sdiv x9, x0, x8
-; CHECK-GI-NEXT: msub x0, x9, x8, x0
+; CHECK-GI-NEXT: mov x8, #55051 // =0xd70b
+; CHECK-GI-NEXT: mov w9, #100 // =0x64
+; CHECK-GI-NEXT: movk x8, #28835, lsl #16
+; CHECK-GI-NEXT: movk x8, #2621, lsl #32
+; CHECK-GI-NEXT: movk x8, #41943, lsl #48
+; CHECK-GI-NEXT: smulh x8, x0, x8
+; CHECK-GI-NEXT: add x8, x8, x0
+; CHECK-GI-NEXT: asr x8, x8, #6
+; CHECK-GI-NEXT: add x8, x8, x8, lsr #63
+; CHECK-GI-NEXT: msub x0, x8, x9, x0
; CHECK-GI-NEXT: ret
entry:
%s = srem i64 %a, 100
@@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) {
;
; CHECK-GI-LABEL: sv2i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT: mov w8, #65427 // =0xff93
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: sdiv w9, w9, w8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: smov w11, v1.h[1]
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: smov w10, v1.h[0]
+; CHECK-GI-NEXT: smov w8, v1.h[0]
+; CHECK-GI-NEXT: smov w9, v1.h[1]
+; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov w8, #8 // =0x8
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #2 // =0x2
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v2.b[1], w8
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: neg v2.8b, v2.8b
+; CHECK-GI-NEXT: mov w9, v1.s[1]
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: neg v2.8b, v3.8b
+; CHECK-GI-NEXT: movi v3.2s, #7
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: umov w8, v1.b[0]
+; CHECK-GI-NEXT: umov w10, v1.b[1]
+; CHECK-GI-NEXT: umov w9, v2.b[0]
+; CHECK-GI-NEXT: umov w11, v2.b[1]
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s1, w10
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov v1.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i8> %d, <i8 7, i8 7>
@@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) {
;
; CHECK-GI-LABEL: sv2i8_100:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: mov w8, #100 // =0x64
-; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24
+; CHECK-GI-NEXT: mov w8, #41 // =0x29
; CHECK-GI-NEXT: fmov s1, w8
-; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-GI-NEXT: mov v1.h[1], w8
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: sdiv w9, w9, w8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-GI-NEXT: smov w11, v1.h[1]
-; CHECK-GI-NEXT: sdiv w8, w10, w8
-; CHECK-GI-NEXT: smov w10, v1.h[0]
+; CHECK-GI-NEXT: smov w8, v1.h[0]
+; CHECK-GI-NEXT: smov w9, v1.h[1]
+; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24
+; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov w8, #8 // =0x8
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: mov v2.h[1], w8
+; CHECK-GI-NEXT: mov w8, #4 // =0x4
+; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: neg v2.4h, v2.4h
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: mov w8, #7 // =0x7
+; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: neg v3.8b, v3.8b
+; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: mov v2.b[1], w8
+; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v3.8b
+; CHECK-GI-NEXT: neg v2.8b, v2.8b
+; CHECK-GI-NEXT: movi v3.2s, #100
+; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b
+; CHECK-GI-NEXT: umov w8, v1.b[0]
+; CHECK-GI-NEXT: umov w10, v1.b[1]
+; CHECK-GI-NEXT: umov w9, v2.b[0]
+; CHECK-GI-NEXT: umov w11, v2.b[1]
+; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: fmov s2, w9
-; CHECK-GI-NEXT: fmov s1, w10
-; CHECK-GI-NEXT: mov v1.s[1], w11
-; CHECK-GI-NEXT: mov v2.s[1], w8
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: mov v1.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s
+; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: ret
entry:
%s = srem <2 x i8> %d, <i8 100, i8 100>
@@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) {
;
; CHECK-GI-LABEL: sv4i8_7:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: mov w8, #7 // =0x7
-; CHECK-GI-NEXT: movi v3.4h, #7
-; CHECK-GI-NEXT: fmov s2, w8
-; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: mov v2.h[1], w8
-; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: fmov w9, s0
-; CHECK-GI-NEXT: mov w10, v0.s[1]
-; CHECK-GI-NEXT: mov w11, v0.s[2]
-; CHECK-GI-NEXT: mov w12, v0.s[3]
-; CHECK-GI-NEXT: mov v3.d[1], v2.d[0]
-; CHECK-GI-NEXT: sdiv w9, w9, w8
-; CHECK-GI-NEXT: sdiv w10, w10, w8
-; CHECK-GI-NEXT: fmov s1, w9
-; CHECK-GI-NEXT: sdiv w11, w11, w8
-; CHECK-GI-NEXT: mov v1.s[1], w10
-; CHECK-GI-NEXT: sdiv w9, w12, w8
-; CHECK-GI-NEXT: mov v1.s[2], w11
-; CHECK-GI-NEXT: mov v1.s[3], w9
-; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: xtn v0.4h, v0.4s
+; CHECK-GI-NEXT: mov w8, #147 // =0x93
+; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8
+; CHECK-GI-NEXT: mov w9, #7 // =0x7
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov s4, w9
+; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8
+; CHECK-GI-NEXT: mov v1.b[1], w8
+; CHECK-GI-NEXT: mov v4.b[1], w9
+; CHECK-GI-NEXT: mov v1.b[2], w8
+; CHECK-GI-NEXT: mov v4.b[2], w9
+; CHECK-GI-NEXT: mov v1.b[3], w8
+; CHECK-GI-NEXT: mov w8, #2 // =0x2
+; CHECK-GI-NEXT: mov v4.b[3], w9
+; CHECK-GI-NEXT: fmov s3, w8
+; CHECK-GI-NEXT: mov v3.b[1], w8
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h
+; CHECK-GI-NEXT: fmov d2, d0
+; CHECK-GI-NEXT: mov v3.b[2], w8
+; CHECK-GI-NEXT: ssra v2.4h, v1.4h...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks for updating matchUDivOrURemByConst too.
Build bots are unhappy: https://lab.llvm.org/buildbot/#/builders/207/builds/4106
|
Hi - There is a problem with cmake missing dependencies on certain tablegen files. I think older versions of cmake don't create all the correct dependencies in ninja. Is it possible to try a clean build? |
Hiya @melver, thanks for bringing this up - AFAIK, this is an issue where the .inc files sometimes don't regenerate after tablegen files are edited. It has come up before, and it fixed itself a few hours after the merge :( but I'll keep an eye on the buildbots in case I need to revert. |
This patch allows srem by a constant to be expanded more efficiently, avoiding the need for expensive sdiv instructions. This is the last of the patches that fix #118090.