@@ -407,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
 };
 
 class SIInsertWaitcnts {
+public:
+  const GCNSubtarget *ST;
+  InstCounterType SmemAccessCounter;
+  InstCounterType MaxCounter;
+  const unsigned *WaitEventMaskForInst;
+
 private:
-  const GCNSubtarget *ST = nullptr;
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
@@ -424,8 +429,6 @@ class SIInsertWaitcnts {
     bool Dirty = true;
   };
 
-  InstCounterType SmemAccessCounter;
-
   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
 
   bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -442,7 +445,7 @@ class SIInsertWaitcnts {
   // message.
   DenseSet<MachineInstr *> ReleaseVGPRInsts;
 
-  InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+  HardwareLimits Limits;
 
 public:
   SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -453,6 +456,30 @@ class SIInsertWaitcnts {
     (void)ForceVMCounter;
   }
 
+  unsigned getWaitCountMax(InstCounterType T) const {
+    switch (T) {
+    case LOAD_CNT:
+      return Limits.LoadcntMax;
+    case DS_CNT:
+      return Limits.DscntMax;
+    case EXP_CNT:
+      return Limits.ExpcntMax;
+    case STORE_CNT:
+      return Limits.StorecntMax;
+    case SAMPLE_CNT:
+      return Limits.SamplecntMax;
+    case BVH_CNT:
+      return Limits.BvhcntMax;
+    case KM_CNT:
+      return Limits.KmcntMax;
+    case X_CNT:
+      return Limits.XcntMax;
+    default:
+      break;
+    }
+    return 0;
+  }
+
   bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
   bool isPreheaderToFlush(MachineBasicBlock &MBB,
                           const WaitcntBrackets &ScoreBrackets);
@@ -568,39 +595,10 @@ class SIInsertWaitcnts {
 // "s_waitcnt 0" before use.
 class WaitcntBrackets {
 public:
-  WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
-                  HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
-                  InstCounterType SmemAccessCounter)
-      : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
-        WaitEventMaskForInst(WaitEventMaskForInst),
-        SmemAccessCounter(SmemAccessCounter) {}
-
-  unsigned getWaitCountMax(InstCounterType T) const {
-    switch (T) {
-    case LOAD_CNT:
-      return Limits.LoadcntMax;
-    case DS_CNT:
-      return Limits.DscntMax;
-    case EXP_CNT:
-      return Limits.ExpcntMax;
-    case STORE_CNT:
-      return Limits.StorecntMax;
-    case SAMPLE_CNT:
-      return Limits.SamplecntMax;
-    case BVH_CNT:
-      return Limits.BvhcntMax;
-    case KM_CNT:
-      return Limits.KmcntMax;
-    case X_CNT:
-      return Limits.XcntMax;
-    default:
-      break;
-    }
-    return 0;
-  }
+  WaitcntBrackets(const SIInsertWaitcnts *Parent) : Parent(Parent) {}
 
   bool isSmemCounter(InstCounterType T) const {
-    return T == SmemAccessCounter || T == X_CNT;
+    return T == Parent->SmemAccessCounter || T == X_CNT;
   }
 
   unsigned getSgprScoresIdx(InstCounterType T) const {
@@ -658,7 +656,7 @@ class WaitcntBrackets {
     return PendingEvents & (1 << E);
   }
   unsigned hasPendingEvent(InstCounterType T) const {
-    unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+    unsigned HasPending = PendingEvents & Parent->WaitEventMaskForInst[T];
     assert((HasPending != 0) == (getScoreRange(T) != 0));
     return HasPending;
   }
@@ -686,7 +684,8 @@ class WaitcntBrackets {
   }
 
   unsigned getPendingGDSWait() const {
-    return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
+    return std::min(getScoreUB(DS_CNT) - LastGDS,
+                    Parent->getWaitCountMax(DS_CNT) - 1);
   }
 
   void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -710,8 +709,9 @@ class WaitcntBrackets {
   }
 
   void setStateOnFunctionEntryOrReturn() {
-    setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
-    PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+    setScoreUB(STORE_CNT,
+               getScoreUB(STORE_CNT) + Parent->getWaitCountMax(STORE_CNT));
+    PendingEvents |= Parent->WaitEventMaskForInst[STORE_CNT];
   }
 
   ArrayRef<const MachineInstr *> getLDSDMAStores() const {
@@ -747,8 +747,8 @@ class WaitcntBrackets {
     if (T != EXP_CNT)
       return;
 
-    if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
-      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+    if (getScoreRange(EXP_CNT) > Parent->getWaitCountMax(EXP_CNT))
+      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Parent->getWaitCountMax(EXP_CNT);
   }
 
   void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
@@ -763,11 +763,8 @@ class WaitcntBrackets {
                          const MachineOperand &Op, InstCounterType CntTy,
                          unsigned Val);
 
-  const GCNSubtarget *ST = nullptr;
-  InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
-  HardwareLimits Limits = {};
-  const unsigned *WaitEventMaskForInst;
-  InstCounterType SmemAccessCounter;
+  const SIInsertWaitcnts *Parent;
+
   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
   unsigned PendingEvents = 0;
@@ -829,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
 
   RegInterval Result;
 
-  MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
+  MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Parent->ST);
   unsigned RegIdx = TRI->getHWRegIndex(MCReg);
   assert(isUInt<8>(RegIdx));
 
@@ -887,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
 // this at compile time, so we have to assume it might be applied if the
 // instruction supports it).
 bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
-  if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+  if (!Parent->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
     return false;
 
   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -913,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                     const SIRegisterInfo *TRI,
                                     const MachineRegisterInfo *MRI,
                                     WaitEventType E, MachineInstr &Inst) {
-  InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+  InstCounterType T = eventCounter(Parent->WaitEventMaskForInst, E);
 
   unsigned UB = getScoreUB(T);
   unsigned CurrScore = UB + 1;
@@ -1082,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
 }
 
 void WaitcntBrackets::print(raw_ostream &OS) const {
+  const GCNSubtarget *ST = Parent->ST;
+
   OS << '\n';
-  for (auto T : inst_counter_types(MaxCounter)) {
+  for (auto T : inst_counter_types(Parent->MaxCounter)) {
     unsigned SR = getScoreRange(T);
 
     switch (T) {
@@ -1197,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
   // s_waitcnt instruction.
   if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
     if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
-        !ST->hasFlatLgkmVMemCountInOrder()) {
+        !Parent->ST->hasFlatLgkmVMemCountInOrder()) {
       // If there is a pending FLAT operation, and this is a VMem or LGKM
       // waitcnt and the target can report early completion, then we need
       // to force a waitcnt 0.
@@ -1211,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
       // If a counter has been maxed out avoid overflow by waiting for
       // MAX(CounterType) - 1 instead.
       unsigned NeededWait =
-          std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+          std::min(UB - ScoreToWait, Parent->getWaitCountMax(T) - 1);
       addWait(Wait, T, NeededWait);
     }
   }
@@ -1239,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
     setScoreLB(T, UB);
-    PendingEvents &= ~WaitEventMaskForInst[T];
+    PendingEvents &= ~Parent->WaitEventMaskForInst[T];
   }
 }
 
@@ -1264,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
 // the decrement may go out of order.
 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
   // Scalar memory read always can go out of order.
-  if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+  if ((T == Parent->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
       (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
     return true;
   return hasMixedPendingEvents(T);
@@ -2388,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
   VgprUB = std::max(VgprUB, Other.VgprUB);
   SgprUB = std::max(SgprUB, Other.SgprUB);
 
-  for (auto T : inst_counter_types(MaxCounter)) {
+  for (auto T : inst_counter_types(Parent->MaxCounter)) {
     // Merge event flags for this counter
+    const unsigned *WaitEventMaskForInst = Parent->WaitEventMaskForInst;
     const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
     const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
     if (OtherEvents & ~OldEvents)
@@ -2748,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
   for (auto T : inst_counter_types())
     ForceEmitWaitcnt[T] = false;
 
-  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+  WaitEventMaskForInst = WCG->getWaitEventMask();
 
   SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
 
-  HardwareLimits Limits = {};
   if (ST->hasExtendedWaitCounts()) {
     Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
     Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2809,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }
 
-    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
-        ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
     NonKernelInitialState->setStateOnFunctionEntryOrReturn();
     BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
 
@@ -2841,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
        *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
-          Brackets = std::make_unique<WaitcntBrackets>(
-              ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in-place. N.B. do not do this by assigning from a
          // temporary because the WaitcntBrackets class is large and it could
          // cause this function to use an unreasonable amount of stack space.
          Brackets->~WaitcntBrackets();
-          new (Brackets.get()) WaitcntBrackets(
-              ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+          new (Brackets.get()) WaitcntBrackets(this);
        }
      }
 
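As a rough, self-contained sketch of the ownership pattern this diff adopts: the per-function configuration (subtarget pointer, counter limits, event masks) lives once in the pass object, and the score-tracking helper keeps only a `const` back-pointer to it instead of having five fields threaded through its constructor. The names below (`WaitcntPass`, `Brackets`, a two-field `HardwareLimits`) are simplified stand-ins for illustration, not the actual LLVM classes.

```cpp
// Minimal sketch, assuming simplified stand-in names; not the LLVM code itself.
#include <algorithm>
#include <cstdio>
#include <memory>

struct HardwareLimits {
  unsigned LoadcntMax = 0;
  unsigned StorecntMax = 0;
};

enum InstCounterType { LOAD_CNT, STORE_CNT, NUM_INST_CNTS };

class WaitcntPass;

// Tracks pending-operation "score brackets"; reads shared config via Parent.
class Brackets {
public:
  explicit Brackets(const WaitcntPass *Parent) : Parent(Parent) {}

  void pushEvent(InstCounterType T) { ++ScoreUB[T]; }
  unsigned neededWait(InstCounterType T, unsigned ScoreToWait) const;

private:
  const WaitcntPass *Parent;
  unsigned ScoreUB[NUM_INST_CNTS] = {0};
};

// Owns the configuration that previously had to be copied into every
// Brackets object.
class WaitcntPass {
public:
  HardwareLimits Limits;

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    default:
      return 0;
    }
  }

  void run() {
    Limits.LoadcntMax = 63;
    Limits.StorecntMax = 31;
    // The helper no longer needs limits/masks passed explicitly; a single
    // back-pointer is enough.
    auto B = std::make_unique<Brackets>(this);
    for (int I = 0; I < 100; ++I)
      B->pushEvent(LOAD_CNT);
    std::printf("wait on loadcnt: %u\n", B->neededWait(LOAD_CNT, 10));
  }
};

unsigned Brackets::neededWait(InstCounterType T, unsigned ScoreToWait) const {
  // Clamp to the counter's hardware maximum, as determineWait() does above.
  return std::min(ScoreUB[T] - ScoreToWait, Parent->getWaitCountMax(T) - 1);
}

int main() {
  WaitcntPass P;
  P.run();
  return 0;
}
```

A side effect of this design is visible in the diff's placement-new reinitialization: once the only constructor argument is the parent pointer, reconstructing a `WaitcntBrackets` in place no longer has to repeat the full list of configuration arguments.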