Skip to content

Commit 46f497d

Browse files
mshelego authored and igcbot committed
GenXPromoteArray opaque pointers fix
Do not rely on bitcasts when deciding whether an index adjustment is necessary. In opaque-pointer mode, types can change between instructions without bitcasts.
1 parent 6072b2c commit 46f497d

File tree

2 files changed

+127
-27
lines changed

2 files changed

+127
-27
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp

Lines changed: 35 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2019-2024 Intel Corporation
3+
Copyright (C) 2019-2025 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -148,15 +148,14 @@ namespace {
148148
// a considered element in a considered vector.
149149
struct GenericVectorIndex {
150150
Value *Index;
151-
int ElementSizeInBits;
152-
bool NeedAdjust = false;
151+
unsigned ElementSizeInBits;
153152

154-
int getElementSizeInBytes() const {
153+
unsigned getElementSizeInBytes() const {
155154
return ElementSizeInBits / genx::ByteBits;
156155
}
157156

158157
template <typename FolderT = ConstantFolder>
159-
void adjustIndex(Type *Ty, IRBuilder<FolderT> &IRB);
158+
void adjust(Type *Ty, IRBuilder<FolderT> &IRB);
160159
};
161160

162161
class TransposeHelper {
@@ -228,25 +227,35 @@ Type *getBaseType(Type *Ty, Type *BaseTy) {
228227
}
229228

230229
template <typename FolderT>
231-
void GenericVectorIndex::adjustIndex(Type *Ty, IRBuilder<FolderT> &IRB) {
232-
if (!NeedAdjust)
233-
return;
230+
void GenericVectorIndex::adjust(Type *Ty, IRBuilder<FolderT> &IRB) {
234231
auto *BaseTy = getBaseType(Ty, nullptr);
235232
IGC_ASSERT_EXIT(BaseTy);
236-
if (BaseTy->getScalarSizeInBits() == ElementSizeInBits ||
233+
unsigned NewElementSizeInBits = BaseTy->getScalarSizeInBits();
234+
if (NewElementSizeInBits == ElementSizeInBits ||
237235
vc::isFunctionPointerType(BaseTy))
238236
return;
239-
IGC_ASSERT_EXIT(BaseTy->getScalarSizeInBits() == 8);
240-
Constant *Scale =
241-
IRB.getInt32(ElementSizeInBits / BaseTy->getScalarSizeInBits());
242-
if (Index->getType()->isVectorTy()) {
243-
auto Width =
244-
cast<IGCLLVM::FixedVectorType>(Index->getType())->getNumElements();
245-
Scale = ConstantVector::getSplat(IGCLLVM::getElementCount(Width), Scale);
237+
if (NewElementSizeInBits < ElementSizeInBits) {
238+
IGC_ASSERT_MESSAGE(ElementSizeInBits % NewElementSizeInBits == 0,
239+
"New element size is not a divisor of the current one");
240+
Constant *Scale = IRB.getInt32(ElementSizeInBits / NewElementSizeInBits);
241+
if (Index->getType()->isVectorTy()) {
242+
auto Width =
243+
cast<IGCLLVM::FixedVectorType>(Index->getType())->getNumElements();
244+
Scale = ConstantVector::getSplat(IGCLLVM::getElementCount(Width), Scale);
245+
}
246+
Index = IRB.CreateMul(Index, Scale);
247+
} else {
248+
IGC_ASSERT_MESSAGE(NewElementSizeInBits % ElementSizeInBits == 0,
249+
"New element size is not a multiple of the current one");
250+
Constant *Scale = IRB.getInt32(NewElementSizeInBits / ElementSizeInBits);
251+
if (Index->getType()->isVectorTy()) {
252+
auto Width =
253+
cast<IGCLLVM::FixedVectorType>(Index->getType())->getNumElements();
254+
Scale = ConstantVector::getSplat(IGCLLVM::getElementCount(Width), Scale);
255+
}
256+
Index = IRB.CreateUDiv(Index, Scale);
246257
}
247-
Index = IRB.CreateMul(Index, Scale);
248-
ElementSizeInBits = BaseTy->getScalarSizeInBits();
249-
NeedAdjust = false;
258+
ElementSizeInBits = NewElementSizeInBits;
250259
}
251260

252261
template <typename FolderT>
@@ -291,7 +300,6 @@ void TransposeHelper::EraseDeadCode() {
291300
}
292301

293302
void TransposeHelper::handleBCInst(BitCastInst &BC, GenericVectorIndex Idx) {
294-
Idx.NeedAdjust = true;
295303
ToBeRemoved.push_back(&BC);
296304
handleAllocaSources(BC, Idx);
297305
}
@@ -375,7 +383,7 @@ void TransposeHelper::handleGEPInst(GetElementPtrInst *GEP,
375383
GenericVectorIndex Idx) {
376384
ToBeRemoved.push_back(GEP);
377385
IRBuilder<> IRB(GEP);
378-
Idx.adjustIndex(GEP->getSourceElementType(), IRB);
386+
Idx.adjust(GEP->getSourceElementType(), IRB);
379387
Value *PtrOp = GEP->getPointerOperand();
380388
PointerType *PtrTy = dyn_cast<PointerType>(PtrOp->getType());
381389
IGC_ASSERT_MESSAGE(PtrTy, "Only accept scalar pointer!");
@@ -499,7 +507,7 @@ void TransposeHelper::handlePHINode(PHINode *Phi, GenericVectorIndex Idx,
499507
void TransposeHelper::handleLoadInst(LoadInst *Load, GenericVectorIndex Idx) {
500508
IGC_ASSERT(Load->isSimple());
501509
IRBuilder<> IRB(Load);
502-
Idx.adjustIndex(Load->getType(), IRB);
510+
Idx.adjust(Load->getType(), IRB);
503511
auto *ScalarizedIdx =
504512
IRB.CreateMul(Idx.Index, ConstantInt::get(Idx.Index->getType(),
505513
Idx.getElementSizeInBytes()));
@@ -559,7 +567,7 @@ void TransposeHelper::handleStoreInst(StoreInst *Store,
559567
IGC_ASSERT(Store->isSimple());
560568
IRBuilder<> IRB(Store);
561569
Value *StoreVal = Store->getValueOperand();
562-
Idx.adjustIndex(StoreVal->getType(), IRB);
570+
Idx.adjust(StoreVal->getType(), IRB);
563571
auto *ScalarizedIdx =
564572
IRB.CreateMul(Idx.Index, ConstantInt::get(Idx.Index->getType(),
565573
Idx.getElementSizeInBytes()));
@@ -626,7 +634,7 @@ void TransposeHelper::handleStoreInst(StoreInst *Store,
626634
void TransposeHelper::handleGather(IntrinsicInst *Inst, GenericVectorIndex Idx,
627635
unsigned MaskIndex, unsigned ValueIndex) {
628636
IRBuilder<> IRB(Inst);
629-
Idx.adjustIndex(Inst->getType(), IRB);
637+
Idx.adjust(Type::getInt8Ty(Inst->getContext()), IRB);
630638
auto *ScalarizedIdx =
631639
IRB.CreateMul(Idx.Index, ConstantInt::get(Idx.Index->getType(),
632640
Idx.getElementSizeInBytes()));
@@ -666,8 +674,8 @@ void TransposeHelper::handleGather(IntrinsicInst *Inst, GenericVectorIndex Idx,
666674
void TransposeHelper::handleScatter(IntrinsicInst *Inst, GenericVectorIndex Idx,
667675
unsigned MaskIndex, unsigned ValueIndex) {
668676
IRBuilder<> IRB(Inst);
677+
Idx.adjust(Type::getInt8Ty(Inst->getContext()), IRB);
669678
auto *StoreVal = Inst->getArgOperand(ValueIndex);
670-
Idx.adjustIndex(StoreVal->getType(), IRB);
671679
auto *ScalarizedIdx =
672680
IRB.CreateMul(Idx.Index, ConstantInt::get(Idx.Index->getType(),
673681
Idx.getElementSizeInBytes()));
@@ -1122,8 +1130,8 @@ void GenXPromoteArray::handleAllocaInst(AllocaInst *Alloca) {
11221130
return;
11231131

11241132
IRBuilder<> IRB(VecAlloca);
1125-
GenericVectorIndex StartIdx{IRB.getInt32(0),
1126-
static_cast<int>(DL->getTypeSizeInBits(BaseTy))};
1133+
GenericVectorIndex StartIdx{
1134+
IRB.getInt32(0), static_cast<unsigned>(DL->getTypeSizeInBits(BaseTy))};
11271135
TransposeHelper Helper(VecAlloca, DL);
11281136
Helper.handleAllocaSources(*Alloca, StartIdx);
11291137
Helper.EraseDeadCode();
Lines changed: 92 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,92 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: %opt_opaque_ptrs %use_old_pass_manager% -GenXPromoteArray -march=genx64 -mcpu=XeLP -S < %s | FileCheck %s --check-prefixes=CHECK
10+
11+
define dllexport spir_kernel void @f_f(ptr addrspace(1) %out) {
12+
; CHECK: [[ALLOCA:%.*]] = alloca <4 x i32>
13+
%alloca = alloca [4 x i32], align 64
14+
; CHECK-NEXT: [[LOAD0:%.*]] = load <4 x i32>, ptr [[ALLOCA]]
15+
; CHECK-NEXT: [[INS0:%.*]] = insertelement <4 x i32> [[LOAD0]], i32 0, i32 0
16+
; CHECK-NEXT: [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 1, i32 1
17+
; CHECK-NEXT: store <4 x i32> [[INS1]], ptr [[ALLOCA]]
18+
store <2 x i32> <i32 0, i32 1>, ptr %alloca
19+
; CHECK-NEXT: [[LOAD1:%.*]] = load <4 x i32>, ptr [[ALLOCA]]
20+
; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x i32> [[LOAD1]], i32 2, i32 2
21+
; CHECK-NEXT: [[INS3:%.*]] = insertelement <4 x i32> [[INS2]], i32 3, i32 3
22+
; CHECK-NEXT: store <4 x i32> [[INS3]], ptr [[ALLOCA]]
23+
%gep1 = getelementptr i8, ptr %alloca, i64 8
24+
store <2 x i32> <i32 2, i32 3>, ptr %gep1
25+
; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x i32>, ptr [[ALLOCA]]
26+
; CHECK-NEXT: [[BC:%.*]] = bitcast <4 x i32> [[LOAD2]] to <16 x i8>
27+
; CHECK-NEXT: [[EX0:%.*]] = extractelement <16 x i8> [[BC]], i32 0
28+
; CHECK-NEXT: [[INS4:%.*]] = insertelement <12 x i8> undef, i8 [[EX0]], i32 0
29+
; CHECK-NEXT: [[EX1:%.*]] = extractelement <16 x i8> [[BC]], i32 1
30+
; CHECK-NEXT: [[INS5:%.*]] = insertelement <12 x i8> [[INS4]], i8 [[EX1]], i32 1
31+
; CHECK-NEXT: [[EX2:%.*]] = extractelement <16 x i8> [[BC]], i32 2
32+
; CHECK-NEXT: [[INS6:%.*]] = insertelement <12 x i8> [[INS5]], i8 [[EX2]], i32 2
33+
; CHECK-NEXT: [[EX3:%.*]] = extractelement <16 x i8> [[BC]], i32 3
34+
; CHECK-NEXT: [[INS7:%.*]] = insertelement <12 x i8> [[INS6]], i8 [[EX3]], i32 3
35+
; CHECK-NEXT: [[EX4:%.*]] = extractelement <16 x i8> [[BC]], i32 4
36+
; CHECK-NEXT: [[INS8:%.*]] = insertelement <12 x i8> [[INS7]], i8 [[EX4]], i32 4
37+
; CHECK-NEXT: [[EX5:%.*]] = extractelement <16 x i8> [[BC]], i32 5
38+
; CHECK-NEXT: [[INS9:%.*]] = insertelement <12 x i8> [[INS8]], i8 [[EX5]], i32 5
39+
; CHECK-NEXT: [[EX6:%.*]] = extractelement <16 x i8> [[BC]], i32 6
40+
; CHECK-NEXT: [[INS10:%.*]] = insertelement <12 x i8> [[INS9]], i8 [[EX6]], i32 6
41+
; CHECK-NEXT: [[EX7:%.*]] = extractelement <16 x i8> [[BC]], i32 7
42+
; CHECK-NEXT: [[INS11:%.*]] = insertelement <12 x i8> [[INS10]], i8 [[EX7]], i32 7
43+
; CHECK-NEXT: [[EX8:%.*]] = extractelement <16 x i8> [[BC]], i32 8
44+
; CHECK-NEXT: [[INS12:%.*]] = insertelement <12 x i8> [[INS11]], i8 [[EX8]], i32 8
45+
; CHECK-NEXT: [[EX9:%.*]] = extractelement <16 x i8> [[BC]], i32 9
46+
; CHECK-NEXT: [[INS13:%.*]] = insertelement <12 x i8> [[INS12]], i8 [[EX9]], i32 9
47+
; CHECK-NEXT: [[EX10:%.*]] = extractelement <16 x i8> [[BC]], i32 10
48+
; CHECK-NEXT: [[INS14:%.*]] = insertelement <12 x i8> [[INS13]], i8 [[EX10]], i32 10
49+
; CHECK-NEXT: [[EX11:%.*]] = extractelement <16 x i8> [[BC]], i32 11
50+
; CHECK-NEXT: [[INS15:%.*]] = insertelement <12 x i8> [[INS14]], i8 [[EX11]], i32 11
51+
%gep2 = getelementptr i8, ptr %alloca, i64 4
52+
%load1 = load <12 x i8>, ptr %alloca
53+
; CHECK-NEXT: [[LOAD3:%.*]] = load <4 x i32>, ptr [[ALLOCA]]
54+
; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[LOAD3]] to <16 x i8>
55+
; CHECK-NEXT: [[EX12:%.*]] = extractelement <12 x i8> [[INS15]], i32 0
56+
; CHECK-NEXT: [[INS16:%.*]] = insertelement <16 x i8> [[BC1]], i8 [[EX12]], i32 4
57+
; CHECK-NEXT: [[EX13:%.*]] = extractelement <12 x i8> [[INS15]], i32 1
58+
; CHECK-NEXT: [[INS17:%.*]] = insertelement <16 x i8> [[INS16]], i8 [[EX13]], i32 5
59+
; CHECK-NEXT: [[EX14:%.*]] = extractelement <12 x i8> [[INS15]], i32 2
60+
; CHECK-NEXT: [[INS18:%.*]] = insertelement <16 x i8> [[INS17]], i8 [[EX14]], i32 6
61+
; CHECK-NEXT: [[EX15:%.*]] = extractelement <12 x i8> [[INS15]], i32 3
62+
; CHECK-NEXT: [[INS19:%.*]] = insertelement <16 x i8> [[INS18]], i8 [[EX15]], i32 7
63+
; CHECK-NEXT: [[EX16:%.*]] = extractelement <12 x i8> [[INS15]], i32 4
64+
; CHECK-NEXT: [[INS20:%.*]] = insertelement <16 x i8> [[INS19]], i8 [[EX16]], i32 8
65+
; CHECK-NEXT: [[EX17:%.*]] = extractelement <12 x i8> [[INS15]], i32 5
66+
; CHECK-NEXT: [[INS21:%.*]] = insertelement <16 x i8> [[INS20]], i8 [[EX17]], i32 9
67+
; CHECK-NEXT: [[EX18:%.*]] = extractelement <12 x i8> [[INS15]], i32 6
68+
; CHECK-NEXT: [[INS22:%.*]] = insertelement <16 x i8> [[INS21]], i8 [[EX18]], i32 10
69+
; CHECK-NEXT: [[EX19:%.*]] = extractelement <12 x i8> [[INS15]], i32 7
70+
; CHECK-NEXT: [[INS23:%.*]] = insertelement <16 x i8> [[INS22]], i8 [[EX19]], i32 11
71+
; CHECK-NEXT: [[EX20:%.*]] = extractelement <12 x i8> [[INS15]], i32 8
72+
; CHECK-NEXT: [[INS24:%.*]] = insertelement <16 x i8> [[INS23]], i8 [[EX20]], i32 12
73+
; CHECK-NEXT: [[EX21:%.*]] = extractelement <12 x i8> [[INS15]], i32 9
74+
; CHECK-NEXT: [[INS25:%.*]] = insertelement <16 x i8> [[INS24]], i8 [[EX21]], i32 13
75+
; CHECK-NEXT: [[EX22:%.*]] = extractelement <12 x i8> [[INS15]], i32 10
76+
; CHECK-NEXT: [[INS26:%.*]] = insertelement <16 x i8> [[INS25]], i8 [[EX22]], i32 14
77+
; CHECK-NEXT: [[EX23:%.*]] = extractelement <12 x i8> [[INS15]], i32 11
78+
; CHECK-NEXT: [[INS27:%.*]] = insertelement <16 x i8> [[INS26]], i8 [[EX23]], i32 15
79+
; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[INS27]] to <4 x i32>
80+
; CHECK-NEXT: store <4 x i32> [[BC2]], ptr [[ALLOCA]]
81+
store <12 x i8> %load1, ptr %gep2
82+
; CHECK-NEXT: [[LOAD4:%.*]] = load <4 x i32>, ptr [[ALLOCA]]
83+
; CHECK-NEXT: [[EX24:%.*]] = extractelement <4 x i32> [[LOAD4]], i32 0
84+
; CHECK-NEXT: [[INS28:%.*]] = insertelement <2 x i32> undef, i32 [[EX24]], i32 0
85+
; CHECK-NEXT: [[EX25:%.*]] = extractelement <4 x i32> [[LOAD4]], i32 1
86+
; CHECK-NEXT: [[INS29:%.*]] = insertelement <2 x i32> [[INS28]], i32 [[EX25]], i32 1
87+
%load2 = load <2 x i32>, ptr %alloca
88+
; CHECK-NEXT: store <2 x i32> [[INS29]], ptr addrspace(1) %out
89+
store <2 x i32> %load2, ptr addrspace(1) %out
90+
; CHECK-NEXT: ret void
91+
ret void
92+
}

0 commit comments

Comments
 (0)