@@ -196,6 +196,47 @@ static mlir::Value emitX86PSLLDQIByteShift(CIRGenFunction &cgf,
196
196
return builder.createBitcast (shuffleResult, resultType);
197
197
}
198
198
199
+ static mlir::Value emitX86PSRLDQIByteShift (CIRGenFunction &cgf,
200
+ const CallExpr *E,
201
+ ArrayRef<mlir::Value> Ops) {
202
+ auto &builder = cgf.getBuilder ();
203
+ auto resultType = cast<cir::VectorType>(Ops[0 ].getType ());
204
+ auto loc = cgf.getLoc (E->getExprLoc ());
205
+ unsigned shiftVal = getIntValueFromConstOp (Ops[1 ]) & 0xff ;
206
+
207
+ // If psrldq is shifting the vector more than 15 bytes, emit zero.
208
+ if (shiftVal >= 16 )
209
+ return builder.getZero (loc, resultType);
210
+
211
+ auto numElts = resultType.getSize () * 8 ;
212
+ assert (numElts % 16 == 0 && " Expected a multiple of 16" );
213
+
214
+ llvm::SmallVector<int64_t , 64 > indices;
215
+
216
+ // This correlates to the OG CodeGen
217
+ // As stated in the OG, 256/512-bit psrldq operates on 128-bit lanes.
218
+ // So we have to make sure we handle it.
219
+ for (unsigned l = 0 ; l < numElts; l += 16 ) {
220
+ for (unsigned i = 0 ; i < 16 ; ++i) {
221
+ unsigned idx = i + shiftVal;
222
+ if (idx >= 16 )
223
+ idx += numElts - 16 ;
224
+ indices.push_back (idx + l);
225
+ }
226
+ }
227
+
228
+ auto byteVecTy = cir::VectorType::get (builder.getSInt8Ty (), numElts);
229
+ mlir::Value byteCast = builder.createBitcast (Ops[0 ], byteVecTy);
230
+ mlir::Value zero = builder.getZero (loc, byteVecTy);
231
+
232
+ // Perform the shuffle (right shift by inserting zeros from the left)
233
+ mlir::Value shuffleResult =
234
+ builder.createVecShuffle (loc, byteCast, zero, indices);
235
+
236
+ // Cast back to original type
237
+ return builder.createBitcast (shuffleResult, resultType);
238
+ }
239
+
199
240
mlir::Value CIRGenFunction::emitX86BuiltinExpr (unsigned BuiltinID,
200
241
const CallExpr *E) {
201
242
if (BuiltinID == Builtin::BI__builtin_cpu_is)
@@ -1161,7 +1202,7 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
1161
1202
case X86::BI__builtin_ia32_psrldqi128_byteshift:
1162
1203
case X86::BI__builtin_ia32_psrldqi256_byteshift:
1163
1204
case X86::BI__builtin_ia32_psrldqi512_byteshift:
1164
- llvm_unreachable ( " psrldqi NYI " );
1205
+ emitX86PSRLDQIByteShift (* this , E, Ops );
1165
1206
case X86::BI__builtin_ia32_kshiftliqi:
1166
1207
case X86::BI__builtin_ia32_kshiftlihi:
1167
1208
case X86::BI__builtin_ia32_kshiftlisi:
0 commit comments