@@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
4436444364/// A horizontal-op B, for some already available A and B, and if so then LHS is
4436544365/// set to A, RHS to B, and the routine returns 'true'.
4436644366static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
44367- const X86Subtarget &Subtarget,
44368- bool IsCommutative ) {
44367+ const X86Subtarget &Subtarget, bool IsCommutative,
44368+ SmallVectorImpl<int> &PostShuffleMask ) {
4436944369 // If either operand is undef, bail out. The binop should be simplified.
4437044370 if (LHS.isUndef() || RHS.isUndef())
4437144371 return false;
@@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4445844458 RMask.push_back(i);
4445944459 }
4446044460
44461+ // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
44462+ if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
44463+ (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
44464+ isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
44465+ return false;
44466+
4446144467 // If A and B occur in reverse order in RHS, then canonicalize by commuting
4446244468 // RHS operands and shuffle mask.
4446344469 if (A != C) {
@@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4446844474 if (!(A == C && B == D))
4446944475 return false;
4447044476
44477+ PostShuffleMask.clear();
44478+ PostShuffleMask.append(NumElts, SM_SentinelUndef);
44479+
4447144480 // LHS and RHS are now:
4447244481 // LHS = shuffle A, B, LMask
4447344482 // RHS = shuffle A, B, RMask
@@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4447644485 // so we just repeat the inner loop if this is a 256-bit op.
4447744486 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
4447844487 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
44488+ unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
4447944489 assert((NumEltsPer128BitChunk % 2 == 0) &&
4448044490 "Vector type should have an even number of elements in each lane");
4448144491 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
4448744497 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
4448844498 continue;
4448944499
44500+ // Check that successive odd/even elements are being operated on. If not,
44501+ // this is not a horizontal operation.
44502+ if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
44503+ !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
44504+ return false;
44505+
44506+ // Compute the post-shuffle mask index based on where the element
44507+ // is stored in the HOP result, and where it needs to be moved to.
44508+ int Base = LIdx & ~1u;
44509+ int Index = ((Base % NumEltsPer128BitChunk) / 2) +
44510+ ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
44511+
4449044512 // The low half of the 128-bit result must choose from A.
4449144513 // The high half of the 128-bit result must choose from B,
4449244514 // unless B is undef. In that case, we are always choosing from A.
44493- unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
44494- unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
44495-
44496- // Check that successive elements are being operated on. If not, this is
44497- // not a horizontal operation.
44498- int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
44499- if (!(LIdx == Index && RIdx == Index + 1) &&
44500- !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
44501- return false;
44515+ if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
44516+ Index += NumEltsPer64BitChunk;
44517+ PostShuffleMask[i + j] = Index;
4450244518 }
4450344519 }
4450444520
4450544521 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
4450644522 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
4450744523
44508- if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
44524+ bool IsIdentityPostShuffle =
44525+ isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
44526+ if (IsIdentityPostShuffle)
44527+ PostShuffleMask.clear();
44528+
44529+ // Assume a SingleSource HOP if we only shuffle one input and don't need to
44530+ // shuffle the result.
44531+ if (!shouldUseHorizontalOp(LHS == RHS &&
44532+ (NumShuffles < 2 || !IsIdentityPostShuffle),
44533+ DAG, Subtarget))
4450944534 return false;
4451044535
4451144536 LHS = DAG.getBitcast(VT, LHS);
@@ -44524,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
4452444549 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
4452544550
4452644551 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
44552+ SmallVector<int, 8> PostShuffleMask;
4452744553 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
4452844554 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
44529- isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
44530- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44555+ isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
44556+ SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
44557+ if (!PostShuffleMask.empty())
44558+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
44559+ DAG.getUNDEF(VT), PostShuffleMask);
44560+ return HorizBinOp;
44561+ }
4453144562
4453244563 // NOTE: isHorizontalBinOp may have changed LHS/RHS variables.
4453344564
@@ -47620,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
4762047651 bool IsAdd = N->getOpcode() == ISD::ADD;
4762147652 assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");
4762247653
47654+ SmallVector<int, 8> PostShuffleMask;
4762347655 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
4762447656 VT == MVT::v8i32) &&
4762547657 Subtarget.hasSSSE3() &&
47626- isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
47658+ isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask )) {
4762747659 auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
4762847660 ArrayRef<SDValue> Ops) {
47629- return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
47630- DL, Ops[0].getValueType(), Ops);
47661+ return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
47662+ Ops[0].getValueType(), Ops);
4763147663 };
47632- return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
47633- HOpBuilder);
47664+ SDValue HorizBinOp =
47665+ SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
47666+ if (!PostShuffleMask.empty())
47667+ HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47668+ DAG.getUNDEF(VT), PostShuffleMask);
47669+ return HorizBinOp;
4763447670 }
4763547671
4763647672 return SDValue();
0 commit comments