
Commit 7968172

[SM6.9] Scalarize native vector intrinsics on linking pre6.9 (#7690)
When shaders are compiled into libraries as 6.9+, they need to be downgraded for earlier shader models at link time. This change adapts the load/store scalarization pass that serves that purpose so it now scalarizes any native vector DXIL intrinsic that needs it, not just raw-buffer vector loads and stores. It also clarifies the role of, and tightens the testing for, the vector intrinsics that are still scalarized for now. Fixes #7344
1 parent a47537f commit 7968172

11 files changed: +571 -44 lines
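
For context, here is a rough illustration (not taken from this commit) of why the downgrade is needed: in a 6.9 library an elementwise intrinsic such as log() on a float4 can be emitted as a single vector-overloaded DXIL call, along the lines of

  %r = call <4 x float> @dx.op.unary.v4f32(i32 23, <4 x float> %vec)  ; Log(value), hypothetical 6.9 native vector form

Shader models before 6.9 have no vector overloads for these operations, so when such a library is linked for an earlier target, the link-time prepare pass must break calls like this into per-element scalar dx.op calls. That rewrite is what the renamed pass below performs.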

include/dxc/HLSL/DxilGenerationPass.h

Lines changed: 2 additions & 2 deletions
@@ -81,7 +81,7 @@ ModulePass *createResumePassesPass();
 FunctionPass *createMatrixBitcastLowerPass();
 ModulePass *createDxilCleanupAddrSpaceCastPass();
 ModulePass *createDxilRenameResourcesPass();
-ModulePass *createDxilScalarizeVectorLoadStoresPass();
+ModulePass *createDxilScalarizeVectorIntrinsicsPass();
 
 void initializeDxilLowerCreateHandleForLibPass(llvm::PassRegistry &);
 void initializeDxilAllocateResourcesForLibPass(llvm::PassRegistry &);
@@ -116,7 +116,7 @@ void initializeResumePassesPass(llvm::PassRegistry &);
 void initializeMatrixBitcastLowerPassPass(llvm::PassRegistry &);
 void initializeDxilCleanupAddrSpaceCastPass(llvm::PassRegistry &);
 void initializeDxilRenameResourcesPass(llvm::PassRegistry &);
-void initializeDxilScalarizeVectorLoadStoresPass(llvm::PassRegistry &);
+void initializeDxilScalarizeVectorIntrinsicsPass(llvm::PassRegistry &);
 
 ModulePass *createDxilValidateWaveSensitivityPass();
 void initializeDxilValidateWaveSensitivityPass(llvm::PassRegistry &);

lib/HLSL/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ add_llvm_library(LLVMHLSL
   DxilNoops.cpp
   DxilPreserveAllOutputs.cpp
   DxilRenameResourcesPass.cpp
-  DxilScalarizeVectorLoadStores.cpp
+  DxilScalarizeVectorIntrinsics.cpp
   DxilSimpleGVNHoist.cpp
   DxilSignatureValidation.cpp
   DxilTargetLowering.cpp

lib/HLSL/DxilLinker.cpp

Lines changed: 1 addition & 1 deletion
@@ -1249,7 +1249,7 @@ void DxilLinkJob::RunPreparePass(Module &M) {
 
   // If we need SROA and dynamicindexvector to array,
   // do it early to allow following scalarization to go forward.
-  PM.add(createDxilScalarizeVectorLoadStoresPass());
+  PM.add(createDxilScalarizeVectorIntrinsicsPass());
 
   // Remove unused functions.
   PM.add(createDxilDeadFunctionEliminationPass());

lib/HLSL/DxilScalarizeVectorLoadStores.cpp renamed to lib/HLSL/DxilScalarizeVectorIntrinsics.cpp

Lines changed: 68 additions & 27 deletions
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 //                                                                           //
-// DxilScalarizeVectorLoadStores.cpp                                         //
+// DxilScalarizeVectorIntrinsics.cpp                                         //
 // Copyright (C) Microsoft Corporation. All rights reserved.                 //
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
@@ -28,11 +28,12 @@ static void scalarizeVectorLoad(hlsl::OP *HlslOP, const DataLayout &DL,
                                 CallInst *CI);
 static void scalarizeVectorStore(hlsl::OP *HlslOP, const DataLayout &DL,
                                  CallInst *CI);
+static void scalarizeVectorIntrinsic(hlsl::OP *HlslOP, CallInst *CI);
 
-class DxilScalarizeVectorLoadStores : public ModulePass {
+class DxilScalarizeVectorIntrinsics : public ModulePass {
 public:
   static char ID; // Pass identification, replacement for typeid
-  explicit DxilScalarizeVectorLoadStores() : ModulePass(ID) {}
+  explicit DxilScalarizeVectorIntrinsics() : ModulePass(ID) {}
 
   StringRef getPassName() const override {
     return "DXIL scalarize vector load/stores";
@@ -47,24 +48,29 @@ class DxilScalarizeVectorLoadStores : public ModulePass {
     bool Changed = false;
 
     hlsl::OP *HlslOP = DM.GetOP();
-    for (auto FIt : HlslOP->GetOpFuncList(DXIL::OpCode::RawBufferVectorLoad)) {
-      Function *Func = FIt.second;
-      if (!Func)
-        continue;
-      for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
-        CallInst *CI = cast<CallInst>(*(U++));
-        scalarizeVectorLoad(HlslOP, M.getDataLayout(), CI);
-        Changed = true;
-      }
-    }
-    for (auto FIt : HlslOP->GetOpFuncList(DXIL::OpCode::RawBufferVectorStore)) {
-      Function *Func = FIt.second;
-      if (!Func)
-        continue;
-      for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
-        CallInst *CI = cast<CallInst>(*(U++));
-        scalarizeVectorStore(HlslOP, M.getDataLayout(), CI);
-        Changed = true;
+
+    // Iterate and scalarize native vector loads, stores, and other intrinsics.
+    for (auto F = M.functions().begin(); F != M.functions().end();) {
+      Function *Func = &*(F++);
+      DXIL::OpCodeClass OpClass;
+      if (HlslOP->GetOpCodeClass(Func, OpClass)) {
+        if (OpClass == DXIL::OpCodeClass::RawBufferVectorLoad)
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorLoad(HlslOP, M.getDataLayout(), CI);
+            Changed = true;
+          }
+        else if (OpClass == DXIL::OpCodeClass::RawBufferVectorStore)
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorStore(HlslOP, M.getDataLayout(), CI);
+            Changed = true;
+          }
+        else if (Func->getReturnType()->isVectorTy())
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorIntrinsic(HlslOP, CI);
+          }
       }
     }
     return Changed;
@@ -220,12 +226,47 @@ static void scalarizeVectorStore(hlsl::OP *HlslOP, const DataLayout &DL,
   CI->eraseFromParent();
 }
 
-char DxilScalarizeVectorLoadStores::ID = 0;
+// Scalarize the native vector operation represented by `CI`, generating
+// scalar calls for each element of its vector parameters.
+// Use `HlslOP` to retrieve the associated scalar op function.
+static void scalarizeVectorIntrinsic(hlsl::OP *HlslOP, CallInst *CI) {
+
+  IRBuilder<> Builder(CI);
+  VectorType *VT = cast<VectorType>(CI->getType());
+  unsigned VecSize = VT->getNumElements();
+  unsigned ArgNum = CI->getNumArgOperands();
+  OP::OpCode Opcode = OP::getOpCode(CI);
+  Type *Ty = OP::GetOverloadType(Opcode, CI->getCalledFunction());
+  Function *Func = HlslOP->GetOpFunc(Opcode, Ty->getScalarType());
+  SmallVector<Value *, 4> Args(ArgNum);
+  Args[0] = CI->getArgOperand(0); // Copy opcode over.
+
+  // For each element in the vector, generate a new call instruction.
+  // Insert results into a result vector.
+  Value *RetVal = UndefValue::get(CI->getType());
+  for (unsigned ElIx = 0; ElIx < VecSize; ElIx++) {
+    // Replace each vector argument with the result of an extraction.
+    // Skip the known opcode arg as it can't be a vector.
+    for (unsigned ArgIx = 1; ArgIx < ArgNum; ArgIx++) {
+      Value *Arg = CI->getArgOperand(ArgIx);
+      if (Arg->getType()->isVectorTy())
+        Args[ArgIx] = Builder.CreateExtractElement(Arg, ElIx);
+      else
+        Args[ArgIx] = Arg;
+    }
+    Value *ElCI = Builder.CreateCall(Func, Args, CI->getName());
+    RetVal = Builder.CreateInsertElement(RetVal, ElCI, ElIx);
+  }
+  CI->replaceAllUsesWith(RetVal);
+}
+
+char DxilScalarizeVectorIntrinsics::ID = 0;
 
-ModulePass *llvm::createDxilScalarizeVectorLoadStoresPass() {
-  return new DxilScalarizeVectorLoadStores();
+ModulePass *llvm::createDxilScalarizeVectorIntrinsicsPass() {
+  return new DxilScalarizeVectorIntrinsics();
 }
 
-INITIALIZE_PASS(DxilScalarizeVectorLoadStores,
-                "hlsl-dxil-scalarize-vector-load-stores",
-                "DXIL scalarize vector load/stores", false, false)
+INITIALIZE_PASS(
+    DxilScalarizeVectorIntrinsics, "hlsl-dxil-scalarize-vector-intrinsics",
+    "Scalarize native vector DXIL loads, stores, and other intrinsics", false,
+    false)
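
To make the new scalarizeVectorIntrinsic concrete, here is a hedged sketch of the rewrite it performs; the value names, the i32 index type, and the choice of the unary Log opcode (i32 23) are illustrative assumptions, not taken from this diff. A native vector call such as

  %r = call <4 x float> @dx.op.unary.v4f32(i32 23, <4 x float> %vec)

becomes one scalar dx.op call per element, with the results gathered into a vector that replaces all uses of the original call:

  %e0 = extractelement <4 x float> %vec, i32 0
  %c0 = call float @dx.op.unary.f32(i32 23, float %e0)   ; Log(value)
  %r0 = insertelement <4 x float> undef, float %c0, i32 0
  %e1 = extractelement <4 x float> %vec, i32 1
  %c1 = call float @dx.op.unary.f32(i32 23, float %e1)   ; Log(value)
  %r1 = insertelement <4 x float> %r0, float %c1, i32 1
  ; ...elements 2 and 3 follow the same pattern; the final insertelement result replaces %r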
New test file

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
// RUN: %dxc -T lib_6_8 %s | FileCheck %s
// RUN: %dxc -T lib_6_9 %s -Fo %t.1
// RUN: %dxl -T ps_6_8 %t.1 | FileCheck %s --check-prefixes=CHECK,UNARY

// Tests non-native-vector behavior for vec ops that scalarize to something
// more complex than a simple repetition of the same dx.op calls.

StructuredBuffer< vector<float, 4> > buf;
ByteAddressBuffer rbuf;

// CHECK-LABEL: define void @main()
[shader("pixel")]
float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target {

  vector<float, 4> vec1 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<float, 4> vec2 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<float, 4> vec3 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<bool, 4> bvec = rbuf.Load< vector<bool, 4> >(i++*32);
  vector<uint, 4> ivec1 = rbuf.Load< vector<uint, 4> >(i++*32);
  vector<uint, 4> ivec2 = rbuf.Load< vector<uint, 4> >(i++*32);
  vector<float, 4> res = 0;

  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  res += atan2(vec1, vec2);

  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)

  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  res += fmod(vec1, vec3);

  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  res += ldexp(vec1, vec2);

  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  res += pow(vec1, vec2);

  // CHECK: mul i32
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  res += dot(ivec1, ivec2);

  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  res *= modf(vec2, vec3);

  // CHECK: = or i1
  // CHECK: = or i1
  // CHECK: = or i1
  bvec ^= any(vec1);

  // CHECK: = and i1
  // CHECK: = and i1
  // CHECK: = and i1
  bvec ^= all(vec1);

  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  uint4 match = WaveMatch(bvec);

  return select(match, res, vec3);

}

tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl

Lines changed: 11 additions & 9 deletions
@@ -2,6 +2,9 @@
 
 // Long vector tests for vec ops that scalarize to something more complex
 // than a simple repetition of the same dx.op calls.
+// This is a temporary measure to verify that intrinsics are not lowered
+// to native vectors in SM6.9 unintentionally.
+// Ultimately, this file will be deleted when all are correctly lowered.
 
 // CHECK-LABEL: test_atan2
 // CHECK: fdiv fast <8 x float>
@@ -101,15 +104,14 @@ export void test_all(vector<float, 8> vec1, inout vector<bool, 8> bvec) {
 }
 
 // CHECK-LABEL: test_WaveMatch
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
 export uint4 test_WaveMatch(vector<bool, 8> bvec) {
   return WaveMatch(bvec);
 }
