
Commit 7968172

[SM6.9] Scalarize native vector intrinsics on linking pre6.9 (#7690)
When shaders are compiled into libraries as 6.9+, they need to be downgraded for earlier shader models at link time. This change adapts the load/store scalarization pass that serves that purpose so it now scalarizes any native vector DXIL intrinsic that needs it, not just raw-buffer vector loads and stores. It also clarifies the role of, and tightens the testing for, the vector intrinsics that are still scalarized for now. Fixes #7344
1 parent a47537f commit 7968172

11 files changed: +571 -44 lines
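
For context, here is a rough illustration (not taken from this commit) of why the downgrade is needed: in a 6.9 library an elementwise intrinsic such as log() on a float4 can be emitted as a single vector-overloaded DXIL call, along the lines of

  %r = call <4 x float> @dx.op.unary.v4f32(i32 23, <4 x float> %vec)  ; Log(value), hypothetical 6.9 native vector form

Shader models before 6.9 have no vector overloads for these operations, so when such a library is linked for an earlier target, the link-time prepare pass must break calls like this into per-element scalar dx.op calls. That rewrite is what the renamed pass below performs.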

include/dxc/HLSL/DxilGenerationPass.h

Lines changed: 2 additions & 2 deletions
@@ -81,7 +81,7 @@ ModulePass *createResumePassesPass();
 FunctionPass *createMatrixBitcastLowerPass();
 ModulePass *createDxilCleanupAddrSpaceCastPass();
 ModulePass *createDxilRenameResourcesPass();
-ModulePass *createDxilScalarizeVectorLoadStoresPass();
+ModulePass *createDxilScalarizeVectorIntrinsicsPass();
 
 void initializeDxilLowerCreateHandleForLibPass(llvm::PassRegistry &);
 void initializeDxilAllocateResourcesForLibPass(llvm::PassRegistry &);
@@ -116,7 +116,7 @@ void initializeResumePassesPass(llvm::PassRegistry &);
 void initializeMatrixBitcastLowerPassPass(llvm::PassRegistry &);
 void initializeDxilCleanupAddrSpaceCastPass(llvm::PassRegistry &);
 void initializeDxilRenameResourcesPass(llvm::PassRegistry &);
-void initializeDxilScalarizeVectorLoadStoresPass(llvm::PassRegistry &);
+void initializeDxilScalarizeVectorIntrinsicsPass(llvm::PassRegistry &);
 
 ModulePass *createDxilValidateWaveSensitivityPass();
 void initializeDxilValidateWaveSensitivityPass(llvm::PassRegistry &);

lib/HLSL/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ add_llvm_library(LLVMHLSL
   DxilNoops.cpp
   DxilPreserveAllOutputs.cpp
   DxilRenameResourcesPass.cpp
-  DxilScalarizeVectorLoadStores.cpp
+  DxilScalarizeVectorIntrinsics.cpp
   DxilSimpleGVNHoist.cpp
   DxilSignatureValidation.cpp
   DxilTargetLowering.cpp

lib/HLSL/DxilLinker.cpp

Lines changed: 1 addition & 1 deletion
@@ -1249,7 +1249,7 @@ void DxilLinkJob::RunPreparePass(Module &M) {
 
   // If we need SROA and dynamicindexvector to array,
   // do it early to allow following scalarization to go forward.
-  PM.add(createDxilScalarizeVectorLoadStoresPass());
+  PM.add(createDxilScalarizeVectorIntrinsicsPass());
 
   // Remove unused functions.
   PM.add(createDxilDeadFunctionEliminationPass());

lib/HLSL/DxilScalarizeVectorLoadStores.cpp renamed to lib/HLSL/DxilScalarizeVectorIntrinsics.cpp

Lines changed: 68 additions & 27 deletions
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 //                                                                           //
-// DxilScalarizeVectorLoadStores.cpp                                         //
+// DxilScalarizeVectorIntrinsics.cpp                                         //
 // Copyright (C) Microsoft Corporation. All rights reserved.                 //
 // This file is distributed under the University of Illinois Open Source     //
 // License. See LICENSE.TXT for details.                                     //
@@ -28,11 +28,12 @@ static void scalarizeVectorLoad(hlsl::OP *HlslOP, const DataLayout &DL,
                                 CallInst *CI);
 static void scalarizeVectorStore(hlsl::OP *HlslOP, const DataLayout &DL,
                                  CallInst *CI);
+static void scalarizeVectorIntrinsic(hlsl::OP *HlslOP, CallInst *CI);
 
-class DxilScalarizeVectorLoadStores : public ModulePass {
+class DxilScalarizeVectorIntrinsics : public ModulePass {
 public:
   static char ID; // Pass identification, replacement for typeid
-  explicit DxilScalarizeVectorLoadStores() : ModulePass(ID) {}
+  explicit DxilScalarizeVectorIntrinsics() : ModulePass(ID) {}
 
   StringRef getPassName() const override {
     return "DXIL scalarize vector load/stores";
@@ -47,24 +48,29 @@ class DxilScalarizeVectorLoadStores : public ModulePass {
     bool Changed = false;
 
     hlsl::OP *HlslOP = DM.GetOP();
-    for (auto FIt : HlslOP->GetOpFuncList(DXIL::OpCode::RawBufferVectorLoad)) {
-      Function *Func = FIt.second;
-      if (!Func)
-        continue;
-      for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
-        CallInst *CI = cast<CallInst>(*(U++));
-        scalarizeVectorLoad(HlslOP, M.getDataLayout(), CI);
-        Changed = true;
-      }
-    }
-    for (auto FIt : HlslOP->GetOpFuncList(DXIL::OpCode::RawBufferVectorStore)) {
-      Function *Func = FIt.second;
-      if (!Func)
-        continue;
-      for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
-        CallInst *CI = cast<CallInst>(*(U++));
-        scalarizeVectorStore(HlslOP, M.getDataLayout(), CI);
-        Changed = true;
+
+    // Iterate and scalarize native vector loads, stores, and other intrinsics.
+    for (auto F = M.functions().begin(); F != M.functions().end();) {
+      Function *Func = &*(F++);
+      DXIL::OpCodeClass OpClass;
+      if (HlslOP->GetOpCodeClass(Func, OpClass)) {
+        if (OpClass == DXIL::OpCodeClass::RawBufferVectorLoad)
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorLoad(HlslOP, M.getDataLayout(), CI);
+            Changed = true;
+          }
+        else if (OpClass == DXIL::OpCodeClass::RawBufferVectorStore)
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorStore(HlslOP, M.getDataLayout(), CI);
+            Changed = true;
+          }
+        else if (Func->getReturnType()->isVectorTy())
+          for (auto U = Func->user_begin(), UE = Func->user_end(); U != UE;) {
+            CallInst *CI = cast<CallInst>(*(U++));
+            scalarizeVectorIntrinsic(HlslOP, CI);
+          }
       }
     }
     return Changed;
@@ -220,12 +226,47 @@ static void scalarizeVectorStore(hlsl::OP *HlslOP, const DataLayout &DL,
   CI->eraseFromParent();
 }
 
-char DxilScalarizeVectorLoadStores::ID = 0;
+// Scalarize the native vector operation represented by `CI`, generating
+// scalar calls for each element of its vector parameters.
+// Use `HlslOP` to retrieve the associated scalar op function.
+static void scalarizeVectorIntrinsic(hlsl::OP *HlslOP, CallInst *CI) {
+
+  IRBuilder<> Builder(CI);
+  VectorType *VT = cast<VectorType>(CI->getType());
+  unsigned VecSize = VT->getNumElements();
+  unsigned ArgNum = CI->getNumArgOperands();
+  OP::OpCode Opcode = OP::getOpCode(CI);
+  Type *Ty = OP::GetOverloadType(Opcode, CI->getCalledFunction());
+  Function *Func = HlslOP->GetOpFunc(Opcode, Ty->getScalarType());
+  SmallVector<Value *, 4> Args(ArgNum);
+  Args[0] = CI->getArgOperand(0); // Copy opcode over.
+
+  // For each element in the vector, generate a new call instruction.
+  // Insert results into a result vector.
+  Value *RetVal = UndefValue::get(CI->getType());
+  for (unsigned ElIx = 0; ElIx < VecSize; ElIx++) {
+    // Replace each vector argument with the result of an extraction.
+    // Skip the known opcode arg as it can't be a vector.
+    for (unsigned ArgIx = 1; ArgIx < ArgNum; ArgIx++) {
+      Value *Arg = CI->getArgOperand(ArgIx);
+      if (Arg->getType()->isVectorTy())
+        Args[ArgIx] = Builder.CreateExtractElement(Arg, ElIx);
+      else
+        Args[ArgIx] = Arg;
+    }
+    Value *ElCI = Builder.CreateCall(Func, Args, CI->getName());
+    RetVal = Builder.CreateInsertElement(RetVal, ElCI, ElIx);
+  }
+  CI->replaceAllUsesWith(RetVal);
+}
+
+char DxilScalarizeVectorIntrinsics::ID = 0;
 
-ModulePass *llvm::createDxilScalarizeVectorLoadStoresPass() {
-  return new DxilScalarizeVectorLoadStores();
+ModulePass *llvm::createDxilScalarizeVectorIntrinsicsPass() {
+  return new DxilScalarizeVectorIntrinsics();
 }
 
-INITIALIZE_PASS(DxilScalarizeVectorLoadStores,
-                "hlsl-dxil-scalarize-vector-load-stores",
-                "DXIL scalarize vector load/stores", false, false)
+INITIALIZE_PASS(
+    DxilScalarizeVectorIntrinsics, "hlsl-dxil-scalarize-vector-intrinsics",
+    "Scalarize native vector DXIL loads, stores, and other intrinsics", false,
+    false)
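
To make the new scalarizeVectorIntrinsic concrete, here is a hedged sketch of the rewrite it performs; the value names, the i32 index type, and the choice of the unary Log opcode (i32 23) are illustrative assumptions, not taken from this diff. A native vector call such as

  %r = call <4 x float> @dx.op.unary.v4f32(i32 23, <4 x float> %vec)

becomes one scalar dx.op call per element, with the results gathered into a vector that replaces all uses of the original call:

  %e0 = extractelement <4 x float> %vec, i32 0
  %c0 = call float @dx.op.unary.f32(i32 23, float %e0)   ; Log(value)
  %r0 = insertelement <4 x float> undef, float %c0, i32 0
  %e1 = extractelement <4 x float> %vec, i32 1
  %c1 = call float @dx.op.unary.f32(i32 23, float %e1)   ; Log(value)
  %r1 = insertelement <4 x float> %r0, float %c1, i32 1
  ; ...elements 2 and 3 follow the same pattern; the final insertelement result replaces %r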
New test file

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
// RUN: %dxc -T lib_6_8 %s | FileCheck %s
// RUN: %dxc -T lib_6_9 %s -Fo %t.1
// RUN: %dxl -T ps_6_8 %t.1 | FileCheck %s --check-prefixes=CHECK,UNARY

// Tests non-native-vector behavior for vec ops that scalarize to something
// more complex than a simple repetition of the same dx.op calls.

StructuredBuffer< vector<float, 4> > buf;
ByteAddressBuffer rbuf;

// CHECK-LABEL: define void @main()
[shader("pixel")]
float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target {

  vector<float, 4> vec1 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<float, 4> vec2 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<float, 4> vec3 = rbuf.Load< vector<float, 4> >(i++*32);
  vector<bool, 4> bvec = rbuf.Load< vector<bool, 4> >(i++*32);
  vector<uint, 4> ivec1 = rbuf.Load< vector<uint, 4> >(i++*32);
  vector<uint, 4> ivec2 = rbuf.Load< vector<uint, 4> >(i++*32);
  vector<float, 4> res = 0;

  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0x40
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fadd fast float %{{.*}}, 0xC0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oeq float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast oge float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: fcmp fast olt float %{{.*}}, 0
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: and i1
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  // CHECK: select i1 %{{.*}}, float 0x
  res += atan2(vec1, vec2);

  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fdiv fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: fcmp fast oge float
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
  // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)

  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: select i1 %{{.*}}, float %{{.*}}, float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  res += fmod(vec1, vec3);

  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  res += ldexp(vec1, vec2);

  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: fmul fast float
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
  res += pow(vec1, vec2);

  // CHECK: mul i32
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  // CHECK: call i32 @dx.op.tertiary.i32(i32 49, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) ; UMad(a,b,c)
  res += dot(ivec1, ivec2);

  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  // CHECK: fsub fast float
  res *= modf(vec2, vec3);

  // CHECK: = or i1
  // CHECK: = or i1
  // CHECK: = or i1
  bvec ^= any(vec1);

  // CHECK: = and i1
  // CHECK: = and i1
  // CHECK: = and i1
  bvec ^= all(vec1);

  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  // CHECK: call {{.*}} @dx.op.wave
  uint4 match = WaveMatch(bvec);

  return select(match, res, vec3);

}

tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl

Lines changed: 11 additions & 9 deletions
@@ -2,6 +2,9 @@
 
 // Long vector tests for vec ops that scalarize to something more complex
 // than a simple repetition of the same dx.op calls.
+// This is a temporary measure to verify that intrinsics are not lowered
+// to native vectors in SM6.9 unintentionally.
+// Ultimately, this file will be deleted when all are correctly lowered.
 
 // CHECK-LABEL: test_atan2
 // CHECK: fdiv fast <8 x float>
@@ -101,15 +104,14 @@ export void test_all(vector<float, 8> vec1, inout vector<bool, 8> bvec) {
 }
 
 // CHECK-LABEL: test_WaveMatch
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
-// call {{.*}} @dx.op.wave
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
+// CHECK: call {{.*}} @dx.op.waveMatch
 export uint4 test_WaveMatch(vector<bool, 8> bvec) {
   return WaveMatch(bvec);
 }
