@@ -308,7 +308,7 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
308
308
// It can potentially do some global cost estimations.
309
309
// TODO: Having compilation retry enables loop unrolling for this case and determines if unrolling actually helps
310
310
// reduce register pressure.
311
- const unsigned UnrollMaxCountForAlloca = 64 ; // May need to be higher for OpenCL
311
+ const unsigned UnrollMaxCountForAlloca = IGC_GET_FLAG_VALUE (PromoteLoopUnrollwithAllocaCountThreshold);
312
312
bool AllocaFound = false ;
313
313
if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
314
314
IGC_IS_FLAG_ENABLED (EnablePromoteLoopUnrollwithAlloca)) {
@@ -332,12 +332,16 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
332
332
333
333
if (!AI)
334
334
continue ;
335
-
336
- Type *Ty = AI->getAllocatedType ();
337
- unsigned AllocaSize = Ty->isSized () ? DL.getTypeAllocSize (Ty) : 0 ;
338
- if (AllocaSize > 1024 || AllocaSize == 0 )
335
+ // Not fixed size or not in entry block
336
+ // TODO: Can an alloca with a fixed size not reside in the entry block?
337
+ if (!AI->isStaticAlloca ())
338
+ continue ;
339
+ // Assume every iteration consumes 1 alloca element.
340
+ if (cast<ConstantInt>(AI->getArraySize ())->getZExtValue () > UnrollMaxCountForAlloca)
339
341
continue ;
340
342
343
+ // Using alloca size in bytes as the threshold boost seems a bit tricky.
344
+ unsigned AllocaSize = *(AI->getAllocationSizeInBits (DL)) / 8 ;
341
345
ThresholdBoost += AllocaSize;
342
346
if (GEP)
343
347
isGEPLoopInduction[GEP] = true ;
@@ -348,7 +352,6 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
348
352
// LLVM defaults to only 10; boost to UnrollMaxCountForAlloca
349
353
UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
350
354
UP.Threshold += ThresholdBoost;
351
- UP.Runtime = true ;
352
355
UP.UpperBound = true ;
353
356
UP.Force = true ;
354
357
0 commit comments