Commit 881b3fd

[RISCV][IA] Support masked.load for deinterleaveN matching (#149556)
This builds on the recent series of API reworks to implement support for deinterleaveN of masked.load. The goal is to enable masked interleave groups in the vectorizer once all the codegen and costing pieces are in place. I considered including the shuffle-path support in this review as well (since the RISCV target-specific handling should be common), but decided to separate it into its own review to focus attention on one thing at a time.
1 parent: e202dba · commit: 881b3fd
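
As a concrete illustration, the IR shape the pass can now recognize looks roughly like the factor-2 test updated below: a masked.load whose only use is a vector.deinterleaveN intrinsic. This sketch is adapted from that test (the trailing ret is filled in from context); on RISC-V it now lowers to a single vlseg2e8.v, as shown in the test diff.

define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
  ; All-true masked load of the wide (interleaved) vector ...
  %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
  ; ... whose single use is the deinterleave intrinsic.
  %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
  ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %deinterleaved.results
}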

File tree: 3 files changed (+62, -95 lines)


llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 29 additions & 13 deletions
@@ -601,31 +601,47 @@ static Value *getMask(Value *WideMask, unsigned Factor,
 bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
     IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
   Value *LoadedVal = DI->getOperand(0);
-  if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
+  if (!LoadedVal->hasOneUse())
+    return false;
+
+  auto *LI = dyn_cast<LoadInst>(LoadedVal);
+  auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
+  if (!LI && !II)
     return false;
 
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
   assert(Factor && "unexpected deinterleave intrinsic");
 
   Value *Mask = nullptr;
-  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
-    if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
+  if (LI) {
+    if (!LI->isSimple())
       return false;
+
+    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
+                      << " and factor = " << Factor << "\n");
+  } else {
+    assert(II);
+
     // Check mask operand. Handle both all-true/false and interleaved mask.
-    Value *WideMask = VPLoad->getOperand(1);
-    Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
-    if (!Mask)
+    Value *WideMask;
+    switch (II->getIntrinsicID()) {
+    default:
       return false;
+    case Intrinsic::vp_load:
+      WideMask = II->getOperand(1);
+      break;
+    case Intrinsic::masked_load:
+      WideMask = II->getOperand(2);
+      break;
+    }
 
-    LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
-                      << *DI << " and factor = " << Factor << "\n");
-  } else {
-    auto *LI = cast<LoadInst>(LoadedVal);
-    if (!LI->isSimple())
+    Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI));
+    if (!Mask)
       return false;
 
-    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
-                      << " and factor = " << Factor << "\n");
+    LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
+                      << " intrinsic " << *DI << " and factor = "
+                      << Factor << "\n");
   }
 
   // Try and match this with target specific intrinsics.
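
For reference, the operand indices used in the switch above follow from the intrinsic signatures; schematically (placeholder element types, not the exact mangled declarations):

; vp.load:     operand 0 = pointer, operand 1 = mask, operand 2 = EVL
declare <N x T> @llvm.vp.load(ptr, <N x i1>, i32)
; masked.load: operand 0 = pointer, operand 1 = alignment (immarg), operand 2 = mask, operand 3 = passthru
declare <N x T> @llvm.masked.load(ptr, i32, <N x i1>, <N x T>)

The same operand layout is what the RISC-V changes below rely on when pulling out the pointer, alignment, and passthru operands.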

llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

Lines changed: 30 additions & 14 deletions
@@ -131,24 +131,40 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
              : Constant::getAllOnesValue(XLenTy);
     return true;
   }
-  auto *VPLdSt = cast<VPIntrinsic>(I);
-  assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
-          VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
-         "Unexpected intrinsic");
-  Ptr = VPLdSt->getMemoryPointerParam();
-  Alignment = VPLdSt->getPointerAlignment().value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
+  if (auto *VPLdSt = dyn_cast<VPIntrinsic>(I)) {
+    assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load ||
+            VPLdSt->getIntrinsicID() == Intrinsic::vp_store) &&
+           "Unexpected intrinsic");
+    Ptr = VPLdSt->getMemoryPointerParam();
+    Alignment = VPLdSt->getPointerAlignment().value_or(
+        DL.getABITypeAlign(VTy->getElementType()));
+
+    assert(Mask && "vp.load and vp.store needs a mask!");
+
+    Value *WideEVL = VPLdSt->getVectorLengthParam();
+    // Conservatively check if EVL is a multiple of factor, otherwise some
+    // (trailing) elements might be lost after the transformation.
+    if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+      return false;
 
-  assert(Mask && "vp.load and vp.store needs a mask!");
+    auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
+    VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+    return true;
+  }
+  auto *II = cast<IntrinsicInst>(I);
+  assert(II->getIntrinsicID() == Intrinsic::masked_load &&
+         "Unexpected intrinsic");
+  Ptr = II->getOperand(0);
+  Alignment = cast<ConstantInt>(II->getArgOperand(1))->getAlignValue();
 
-  Value *WideEVL = VPLdSt->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor))
+  if (!isa<UndefValue>(II->getOperand(3)))
     return false;
 
-  auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor);
-  VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy);
+  assert(Mask && "masked.load needs a mask!");
+
+  VL = isa<FixedVectorType>(VTy)
+           ? Builder.CreateElementCount(XLenTy, VTy->getElementCount())
+           : Constant::getAllOnesValue(XLenTy);
   return true;
 }
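
One detail of the masked.load handling above: the transform gives up when the passthru operand is not undef/poison, presumably because a segmented load cannot reproduce the passthru value in the disabled lanes. A hypothetical call like the following (not from the tests) would therefore be left untouched:

  ; Non-poison passthru (zeroinitializer) defeats the match; getMemOperands returns false.
  %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i8> zeroinitializer)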

llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll

Lines changed: 3 additions & 68 deletions
@@ -542,10 +542,8 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
 ; CHECK-LABEL: masked_load_factor2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vl4r.v v12, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vnsrl.wi v10, v12, 8
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vlseg2e8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
   %deinterleaved.results = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
@@ -555,23 +553,8 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>} @masked_load_factor2(ptr %p) {
 define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4(ptr %p) {
 ; CHECK-LABEL: masked_loat_factor4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT:    vl4r.v v8, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vlseg4e8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> splat (i1 true), <vscale x 32 x i8> poison)
   %deinterleaved.results = call {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave4.nxv32i8(<vscale x 32 x i8> %vec)
@@ -581,56 +564,8 @@ define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i
 define {<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>} @masked_loat_factor4_mask(ptr %p, <vscale x 8 x i1> %mask) {
 ; CHECK-LABEL: masked_loat_factor4_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    add a3, a1, a2
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    srli a4, a2, 2
-; CHECK-NEXT:    vmv.v.v v10, v8
-; CHECK-NEXT:    srli a5, a2, 3
-; CHECK-NEXT:    vmv.v.v v11, v8
-; CHECK-NEXT:    vsseg4e8.v v8, (a1)
-; CHECK-NEXT:    vl1r.v v8, (a1)
-; CHECK-NEXT:    add a1, a4, a5
-; CHECK-NEXT:    vl1r.v v9, (a3)
-; CHECK-NEXT:    add a3, a3, a2
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    vl1r.v v10, (a3)
-; CHECK-NEXT:    vl1r.v v11, (a2)
-; CHECK-NEXT:    vmsne.vi v9, v9, 0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    vmsne.vi v8, v10, 0
-; CHECK-NEXT:    vmsne.vi v10, v11, 0
-; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v0, v9, a5
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vx v0, v8, a4
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v10, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs4r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg4e8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vlseg4e8.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %interleaved.mask = tail call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
   %vec = call <vscale x 32 x i8> @llvm.masked.load(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
