@@ -158,6 +158,177 @@ static mlir::Value emitX86SExtMask(CIRGenFunction &cgf, mlir::Value op,
158
158
return cgf.getBuilder ().createCast (loc, cir::CastKind::integral, mask, dstTy);
159
159
}
160
160
161
+ // Helper function to convert builtin names to LLVM intrinsic names
162
+ std::string CIRGenFunction::convertBuiltinToIntrinsicName (llvm::StringRef builtinName) {
163
+ // Remove "__builtin_ia32_" prefix
164
+ llvm::StringRef baseName = builtinName.drop_front (15 ); // "__builtin_ia32_".size() == 15
165
+
166
+ // Simple mapping for common patterns
167
+ // This can be extended as needed
168
+ static llvm::StringMap<std::string> intrinsicMap = {
169
+ // Load/Store operations
170
+ {" loadups" , " llvm.x86.sse.loadu.ps" },
171
+ {" loaddqu" , " llvm.x86.sse2.loadu.dq" },
172
+ {" storeups" , " llvm.x86.sse.storeu.ps" },
173
+ {" storedqu" , " llvm.x86.sse2.storeu.dq" },
174
+ {" movntdqa" , " llvm.x86.sse41.movntdqa" },
175
+ {" movntdq" , " llvm.x86.sse2.movnt.dq" },
176
+
177
+ // Arithmetic operations
178
+ {" addps" , " llvm.x86.sse.add.ps" },
179
+ {" subps" , " llvm.x86.sse.sub.ps" },
180
+ {" mulps" , " llvm.x86.sse.mul.ps" },
181
+ {" divps" , " llvm.x86.sse.div.ps" },
182
+
183
+ // Cast operations (these might not need intrinsics)
184
+ {" castps_si128" , " llvm.x86.sse.cast.ps.si128" },
185
+ {" castsi128_ps" , " llvm.x86.sse.cast.si128.ps" },
186
+
187
+ // Set/Zero operations
188
+ {" setzero_ps" , " llvm.x86.sse.setzero.ps" },
189
+ {" setzero_si128" , " llvm.x86.sse2.setzero.si128" },
190
+
191
+ // Unpack operations
192
+ {" unpacklo_epi8" , " llvm.x86.sse2.punpcklbw.128" },
193
+ {" unpackhi_epi8" , " llvm.x86.sse2.punpckhbw.128" },
194
+ {" unpacklo_epi16" , " llvm.x86.sse2.punpcklwd.128" },
195
+ {" unpackhi_epi16" , " llvm.x86.sse2.punpckhwd.128" },
196
+
197
+ // K-mask shift operations (AVX-512)
198
+ {" kshiftliqi" , " llvm.x86.avx512.kshiftl.b" },
199
+ {" kshiftlihi" , " llvm.x86.avx512.kshiftl.w" },
200
+ {" kshiftlisi" , " llvm.x86.avx512.kshiftl.d" },
201
+ {" kshiftlidi" , " llvm.x86.avx512.kshiftl.q" },
202
+ {" kshiftriqi" , " llvm.x86.avx512.kshiftr.b" },
203
+ {" kshiftrihi" , " llvm.x86.avx512.kshiftr.w" },
204
+ {" kshiftrisi" , " llvm.x86.avx512.kshiftr.d" },
205
+ {" kshiftridi" , " llvm.x86.avx512.kshiftr.q" },
206
+
207
+ // Pack operations
208
+ {" packsswb128" , " llvm.x86.sse2.packsswb.128" },
209
+ {" packssdw128" , " llvm.x86.sse2.packssdw.128" },
210
+ {" packuswb128" , " llvm.x86.sse2.packuswb.128" },
211
+
212
+ // Conversion operations
213
+ {" cvtps2dq" , " llvm.x86.sse2.cvtps2dq" },
214
+ {" cvtdq2ps" , " llvm.x86.sse2.cvtdq2ps" },
215
+ {" cvtpd2dq" , " llvm.x86.sse2.cvtpd2dq" },
216
+
217
+ // Shuffle operations
218
+ {" shufps" , " llvm.x86.sse.shuf.ps" },
219
+ {" pshuflw" , " llvm.x86.sse2.pshufl.w" },
220
+ {" pshufhw" , " llvm.x86.sse2.pshufh.w" },
221
+ {" palignr128" , " llvm.x86.ssse3.palign.r.128" },
222
+ {" palignr256" , " llvm.x86.avx2.palign.r" },
223
+ {" permdi256" , " llvm.x86.avx2.permd" },
224
+
225
+ // AES operations
226
+ {" aesdec128" , " llvm.x86.aesni.aesdec" },
227
+ {" aesenc128" , " llvm.x86.aesni.aesenc" },
228
+
229
+ // Shift operations
230
+ {" pslldqi128_byteshift" , " llvm.x86.sse2.psll.dq" },
231
+ {" pslldqi256_byteshift" , " llvm.x86.avx2.psll.dq" },
232
+ {" pslldqi512_byteshift" , " llvm.x86.avx512.psll.dq.512" },
233
+
234
+ // Advanced math operations (using correct LLVM intrinsic names)
235
+ {" sqrtps512" , " llvm.x86.avx512.sqrt.ps.512" },
236
+ {" sqrtpd512" , " llvm.x86.avx512.sqrt.pd.512" },
237
+ // Note: SSE sqrt doesn't have LLVM intrinsics - they become regular sqrt calls
238
+ {" rcpps" , " llvm.x86.sse.rcp.ps" },
239
+ {" rsqrtps" , " llvm.x86.sse.rsqrt.ps" },
240
+ {" minpd" , " llvm.x86.sse2.min.pd" },
241
+ {" maxpd" , " llvm.x86.sse2.max.pd" },
242
+
243
+ // Comparison operations
244
+ {" pcmpeqb128" , " llvm.x86.sse2.pcmpeq.b" },
245
+ {" pcmpeqw128" , " llvm.x86.sse2.pcmpeq.w" },
246
+ {" pcmpeqd128" , " llvm.x86.sse2.pcmpeq.d" },
247
+ {" pcmpgtb128" , " llvm.x86.sse2.pcmpgt.b" },
248
+ {" cmpeqps" , " llvm.x86.sse.cmp.ps" },
249
+ {" cmpltps" , " llvm.x86.sse.cmp.ps" },
250
+ {" cmpleps" , " llvm.x86.sse.cmp.ps" },
251
+
252
+ // Bit manipulation
253
+ {" pand128" , " llvm.x86.sse2.pand" },
254
+ {" por128" , " llvm.x86.sse2.por" },
255
+ {" pxor128" , " llvm.x86.sse2.pxor" },
256
+ {" pandn128" , " llvm.x86.sse2.pandn" },
257
+
258
+ // Mask operations (AVX-512)
259
+ {" kandqi" , " llvm.x86.avx512.kand.b" },
260
+ {" korqi" , " llvm.x86.avx512.kor.b" },
261
+ {" kxorqi" , " llvm.x86.avx512.kxor.b" },
262
+ {" knotqi" , " llvm.x86.avx512.knot.b" },
263
+
264
+ // Conversion operations
265
+ {" cvtdq2ps256" , " llvm.x86.avx.cvtdq2.ps.256" },
266
+ {" cvtpd2ps" , " llvm.x86.sse2.cvtpd2ps" },
267
+ {" cvtps2dq256" , " llvm.x86.avx.cvtps2dq.256" },
268
+
269
+ // Specialized operations
270
+ {" pternlogd128" , " llvm.x86.avx512.pternlog.d.128" },
271
+ {" vpopcntd_128" , " llvm.x86.avx512.vpopcnt.d.128" },
272
+ {" vplzcntd_128" , " llvm.x86.avx512.vplzcnt.d.128" },
273
+
274
+ // Gather/Scatter operations
275
+ {" gathersiv4sf" , " llvm.x86.avx2.gather.d.ps" },
276
+ {" scattersiv4sf" , " llvm.x86.avx512.scatter.dps.512" },
277
+
278
+ // Vector size operations
279
+ {" extract128i256" , " llvm.x86.avx2.vextracti128" },
280
+ {" insert128i256" , " llvm.x86.avx2.vinserti128" },
281
+ {" pbroadcastd256" , " llvm.x86.avx2.pbroadcastd.256" },
282
+
283
+ // String processing
284
+ {" pcmpistri128" , " llvm.x86.sse42.pcmpistri128" },
285
+ {" pcmpistrm128" , " llvm.x86.sse42.pcmpistrm128" },
286
+ };
287
+
288
+ // Check if we have a direct mapping
289
+ auto it = intrinsicMap.find (baseName);
290
+ if (it != intrinsicMap.end ()) {
291
+ return it->second ;
292
+ }
293
+
294
+ // Fallback: For intrinsics without LLVM equivalents, create a function call
295
+ // This allows the backend to handle it as a regular function call
296
+ return (" __" + baseName).str (); // e.g., "__sqrtps" becomes a function call
297
+ }
298
+
299
+ // Generic fallback for unsupported X86 intrinsics
300
+ // This creates a function call with the intrinsic name preserved as a string
301
+ mlir::Value CIRGenFunction::emitX86IntrinsicFallback (unsigned BuiltinID,
302
+ const CallExpr *E,
303
+ llvm::ArrayRef<mlir::Value> Ops) {
304
+ // Get the builtin name from the BuiltinID
305
+ std::string builtinName = getContext ().BuiltinInfo .getName (BuiltinID);
306
+
307
+ // Only handle X86 intrinsics (they start with "__builtin_ia32_")
308
+ llvm::StringRef nameRef (builtinName);
309
+ if (!nameRef.starts_with (" __builtin_ia32_" )) {
310
+ return nullptr ;
311
+ }
312
+
313
+ // Convert builtin name to intrinsic name
314
+ // "__builtin_ia32_addps" -> "llvm.x86.sse.add.ps"
315
+ std::string intrinsicName = convertBuiltinToIntrinsicName (nameRef);
316
+
317
+ // Get the return type
318
+ mlir::Type returnType = convertType (E->getType ());
319
+
320
+ // Create the fallback intrinsic call
321
+ mlir::Location loc = getLoc (E->getExprLoc ());
322
+
323
+ // Use LLVMIntrinsicCallOp to preserve the intrinsic name as a string
324
+ // This allows the LLVM backend to handle it or emit an appropriate error
325
+ auto intrinsicCall = builder.create <cir::LLVMIntrinsicCallOp>(
326
+ loc, builder.getStringAttr (intrinsicName), returnType, Ops);
327
+
328
+ return intrinsicCall.getResult ();
329
+ }
330
+
331
+
161
332
static mlir::Value emitX86PSLLDQIByteShift (CIRGenFunction &cgf,
162
333
const CallExpr *E,
163
334
ArrayRef<mlir::Value> Ops) {
@@ -206,7 +377,7 @@ static mlir::Value emitX86PSRLDQIByteShift(CIRGenFunction &cgf,
206
377
207
378
// If psrldq is shifting the vector more than 15 bytes, emit zero.
208
379
if (shiftVal >= 16 )
209
- return builder.getZero (loc, resultType);
380
+ return builder.getZero (loc, resultType);
210
381
211
382
auto numElts = resultType.getSize () * 8 ;
212
383
assert (numElts % 16 == 0 && " Expected a multiple of 16" );
@@ -215,7 +386,7 @@ static mlir::Value emitX86PSRLDQIByteShift(CIRGenFunction &cgf,
215
386
216
387
// This correlates to the OG CodeGen
217
388
// As stated in the OG, 256/512-bit psrldq operates on 128-bit lanes.
218
- // So we have to make sure we handle it.
389
+ // So we have to make sure we handle it.
219
390
for (unsigned l = 0 ; l < numElts; l += 16 ) {
220
391
for (unsigned i = 0 ; i < 16 ; ++i) {
221
392
unsigned idx = i + shiftVal;
@@ -265,6 +436,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
265
436
266
437
switch (BuiltinID) {
267
438
default :
439
+ // Try generic fallback for unknown X86 intrinsics
440
+ if (auto fallbackResult = emitX86IntrinsicFallback (BuiltinID, E, Ops)) {
441
+ return fallbackResult;
442
+ }
268
443
return nullptr ;
269
444
case X86::BI_mm_prefetch: {
270
445
mlir::Value Address = builder.createPtrBitcast (Ops[0 ], VoidTy);
@@ -1202,17 +1377,28 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned BuiltinID,
1202
1377
case X86::BI__builtin_ia32_psrldqi128_byteshift:
1203
1378
case X86::BI__builtin_ia32_psrldqi256_byteshift:
1204
1379
case X86::BI__builtin_ia32_psrldqi512_byteshift:
1205
- emitX86PSRLDQIByteShift (*this , E, Ops);
1380
+ return emitX86PSRLDQIByteShift (*this , E, Ops);
1206
1381
case X86::BI__builtin_ia32_kshiftliqi:
1207
1382
case X86::BI__builtin_ia32_kshiftlihi:
1208
1383
case X86::BI__builtin_ia32_kshiftlisi:
1209
1384
case X86::BI__builtin_ia32_kshiftlidi:
1210
- llvm_unreachable (" kshiftl NYI" );
1385
+ // llvm_unreachable("kshiftl NYI");
1386
+ // Try generic fallback for unknown X86 intrinsics
1387
+ if (auto fallbackResult = emitX86IntrinsicFallback (BuiltinID, E, Ops)) {
1388
+ return fallbackResult;
1389
+ }
1390
+ return nullptr ;
1211
1391
case X86::BI__builtin_ia32_kshiftriqi:
1212
1392
case X86::BI__builtin_ia32_kshiftrihi:
1213
1393
case X86::BI__builtin_ia32_kshiftrisi:
1214
1394
case X86::BI__builtin_ia32_kshiftridi:
1215
- llvm_unreachable (" kshiftr NYI" );
1395
+ // llvm_unreachable("kshiftr NYI");
1396
+ // Try generic fallback for unknown X86 intrinsics
1397
+ if (auto fallbackResult = emitX86IntrinsicFallback (BuiltinID, E, Ops)) {
1398
+ return fallbackResult;
1399
+ }
1400
+ return nullptr ;
1401
+
1216
1402
// Rotate is a special case of funnel shift - 1st 2 args are the same.
1217
1403
case X86::BI__builtin_ia32_vprotb:
1218
1404
case X86::BI__builtin_ia32_vprotw:
0 commit comments