Skip to content

Commit b0c96d6

Browse files
committed
[AMDGPU] Move common fields out of WaitcntBrackets. NFC.
WaitcntBrackets holds per-basic-block information about the state of wait counters. It also held a bunch of fields that are constant throughout a run of the pass. This patch moves them out into the SIInsertWaitcnts class, for better logical separation and to save a tiny bit of memory.
1 parent: 516da3f; commit: b0c96d6

File tree

1 file changed

+59
-63
lines changed

1 file changed

+59
-63
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 59 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -407,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
407407
};
408408

409409
class SIInsertWaitcnts {
410+
public:
411+
const GCNSubtarget *ST;
412+
InstCounterType SmemAccessCounter;
413+
InstCounterType MaxCounter;
414+
const unsigned *WaitEventMaskForInst;
415+
410416
private:
411-
const GCNSubtarget *ST = nullptr;
412417
const SIInstrInfo *TII = nullptr;
413418
const SIRegisterInfo *TRI = nullptr;
414419
const MachineRegisterInfo *MRI = nullptr;
@@ -424,8 +429,6 @@ class SIInsertWaitcnts {
424429
bool Dirty = true;
425430
};
426431

427-
InstCounterType SmemAccessCounter;
428-
429432
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
430433

431434
bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -442,7 +445,7 @@ class SIInsertWaitcnts {
442445
// message.
443446
DenseSet<MachineInstr *> ReleaseVGPRInsts;
444447

445-
InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
448+
HardwareLimits Limits;
446449

447450
public:
448451
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -453,6 +456,30 @@ class SIInsertWaitcnts {
453456
(void)ForceVMCounter;
454457
}
455458

459+
unsigned getWaitCountMax(InstCounterType T) const {
460+
switch (T) {
461+
case LOAD_CNT:
462+
return Limits.LoadcntMax;
463+
case DS_CNT:
464+
return Limits.DscntMax;
465+
case EXP_CNT:
466+
return Limits.ExpcntMax;
467+
case STORE_CNT:
468+
return Limits.StorecntMax;
469+
case SAMPLE_CNT:
470+
return Limits.SamplecntMax;
471+
case BVH_CNT:
472+
return Limits.BvhcntMax;
473+
case KM_CNT:
474+
return Limits.KmcntMax;
475+
case X_CNT:
476+
return Limits.XcntMax;
477+
default:
478+
break;
479+
}
480+
return 0;
481+
}
482+
456483
bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
457484
bool isPreheaderToFlush(MachineBasicBlock &MBB,
458485
const WaitcntBrackets &ScoreBrackets);
@@ -568,39 +595,10 @@ class SIInsertWaitcnts {
568595
// "s_waitcnt 0" before use.
569596
class WaitcntBrackets {
570597
public:
571-
WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
572-
HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
573-
InstCounterType SmemAccessCounter)
574-
: ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
575-
WaitEventMaskForInst(WaitEventMaskForInst),
576-
SmemAccessCounter(SmemAccessCounter) {}
577-
578-
unsigned getWaitCountMax(InstCounterType T) const {
579-
switch (T) {
580-
case LOAD_CNT:
581-
return Limits.LoadcntMax;
582-
case DS_CNT:
583-
return Limits.DscntMax;
584-
case EXP_CNT:
585-
return Limits.ExpcntMax;
586-
case STORE_CNT:
587-
return Limits.StorecntMax;
588-
case SAMPLE_CNT:
589-
return Limits.SamplecntMax;
590-
case BVH_CNT:
591-
return Limits.BvhcntMax;
592-
case KM_CNT:
593-
return Limits.KmcntMax;
594-
case X_CNT:
595-
return Limits.XcntMax;
596-
default:
597-
break;
598-
}
599-
return 0;
600-
}
598+
WaitcntBrackets(const SIInsertWaitcnts *Parent) : Parent(Parent) {}
601599

602600
bool isSmemCounter(InstCounterType T) const {
603-
return T == SmemAccessCounter || T == X_CNT;
601+
return T == Parent->SmemAccessCounter || T == X_CNT;
604602
}
605603

606604
unsigned getSgprScoresIdx(InstCounterType T) const {
@@ -658,7 +656,7 @@ class WaitcntBrackets {
658656
return PendingEvents & (1 << E);
659657
}
660658
unsigned hasPendingEvent(InstCounterType T) const {
661-
unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
659+
unsigned HasPending = PendingEvents & Parent->WaitEventMaskForInst[T];
662660
assert((HasPending != 0) == (getScoreRange(T) != 0));
663661
return HasPending;
664662
}
@@ -686,7 +684,8 @@ class WaitcntBrackets {
686684
}
687685

688686
unsigned getPendingGDSWait() const {
689-
return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
687+
return std::min(getScoreUB(DS_CNT) - LastGDS,
688+
Parent->getWaitCountMax(DS_CNT) - 1);
690689
}
691690

692691
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -710,8 +709,9 @@ class WaitcntBrackets {
710709
}
711710

712711
void setStateOnFunctionEntryOrReturn() {
713-
setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
714-
PendingEvents |= WaitEventMaskForInst[STORE_CNT];
712+
setScoreUB(STORE_CNT,
713+
getScoreUB(STORE_CNT) + Parent->getWaitCountMax(STORE_CNT));
714+
PendingEvents |= Parent->WaitEventMaskForInst[STORE_CNT];
715715
}
716716

717717
ArrayRef<const MachineInstr *> getLDSDMAStores() const {
@@ -747,8 +747,8 @@ class WaitcntBrackets {
747747
if (T != EXP_CNT)
748748
return;
749749

750-
if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
751-
ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
750+
if (getScoreRange(EXP_CNT) > Parent->getWaitCountMax(EXP_CNT))
751+
ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Parent->getWaitCountMax(EXP_CNT);
752752
}
753753

754754
void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
@@ -763,11 +763,8 @@ class WaitcntBrackets {
763763
const MachineOperand &Op, InstCounterType CntTy,
764764
unsigned Val);
765765

766-
const GCNSubtarget *ST = nullptr;
767-
InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
768-
HardwareLimits Limits = {};
769-
const unsigned *WaitEventMaskForInst;
770-
InstCounterType SmemAccessCounter;
766+
const SIInsertWaitcnts *Parent;
767+
771768
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
772769
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
773770
unsigned PendingEvents = 0;
@@ -829,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
829826

830827
RegInterval Result;
831828

832-
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
829+
MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Parent->ST);
833830
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
834831
assert(isUInt<8>(RegIdx));
835832

@@ -887,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
887884
// this at compile time, so we have to assume it might be applied if the
888885
// instruction supports it).
889886
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
890-
if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
887+
if (!Parent->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
891888
return false;
892889

893890
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -913,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
913910
const SIRegisterInfo *TRI,
914911
const MachineRegisterInfo *MRI,
915912
WaitEventType E, MachineInstr &Inst) {
916-
InstCounterType T = eventCounter(WaitEventMaskForInst, E);
913+
InstCounterType T = eventCounter(Parent->WaitEventMaskForInst, E);
917914

918915
unsigned UB = getScoreUB(T);
919916
unsigned CurrScore = UB + 1;
@@ -1082,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
10821079
}
10831080

10841081
void WaitcntBrackets::print(raw_ostream &OS) const {
1082+
const GCNSubtarget *ST = Parent->ST;
1083+
10851084
OS << '\n';
1086-
for (auto T : inst_counter_types(MaxCounter)) {
1085+
for (auto T : inst_counter_types(Parent->MaxCounter)) {
10871086
unsigned SR = getScoreRange(T);
10881087

10891088
switch (T) {
@@ -1197,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
11971196
// s_waitcnt instruction.
11981197
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
11991198
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1200-
!ST->hasFlatLgkmVMemCountInOrder()) {
1199+
!Parent->ST->hasFlatLgkmVMemCountInOrder()) {
12011200
// If there is a pending FLAT operation, and this is a VMem or LGKM
12021201
// waitcnt and the target can report early completion, then we need
12031202
// to force a waitcnt 0.
@@ -1211,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
12111210
// If a counter has been maxed out avoid overflow by waiting for
12121211
// MAX(CounterType) - 1 instead.
12131212
unsigned NeededWait =
1214-
std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1213+
std::min(UB - ScoreToWait, Parent->getWaitCountMax(T) - 1);
12151214
addWait(Wait, T, NeededWait);
12161215
}
12171216
}
@@ -1239,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12391238
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
12401239
} else {
12411240
setScoreLB(T, UB);
1242-
PendingEvents &= ~WaitEventMaskForInst[T];
1241+
PendingEvents &= ~Parent->WaitEventMaskForInst[T];
12431242
}
12441243
}
12451244

@@ -1264,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
12641263
// the decrement may go out of order.
12651264
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
12661265
// Scalar memory read always can go out of order.
1267-
if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1266+
if ((T == Parent->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
12681267
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
12691268
return true;
12701269
return hasMixedPendingEvents(T);
@@ -2388,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
23882387
VgprUB = std::max(VgprUB, Other.VgprUB);
23892388
SgprUB = std::max(SgprUB, Other.SgprUB);
23902389

2391-
for (auto T : inst_counter_types(MaxCounter)) {
2390+
for (auto T : inst_counter_types(Parent->MaxCounter)) {
23922391
// Merge event flags for this counter
2392+
const unsigned *WaitEventMaskForInst = Parent->WaitEventMaskForInst;
23932393
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
23942394
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
23952395
if (OtherEvents & ~OldEvents)
@@ -2748,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
27482748
for (auto T : inst_counter_types())
27492749
ForceEmitWaitcnt[T] = false;
27502750

2751-
const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2751+
WaitEventMaskForInst = WCG->getWaitEventMask();
27522752

27532753
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
27542754

2755-
HardwareLimits Limits = {};
27562755
if (ST->hasExtendedWaitCounts()) {
27572756
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
27582757
Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2809,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
28092808
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
28102809
}
28112810

2812-
auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2813-
ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2811+
auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
28142812
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
28152813
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
28162814

@@ -2841,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
28412839
*Brackets = *BI.Incoming;
28422840
} else {
28432841
if (!Brackets) {
2844-
Brackets = std::make_unique<WaitcntBrackets>(
2845-
ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2842+
Brackets = std::make_unique<WaitcntBrackets>(this);
28462843
} else {
28472844
// Reinitialize in-place. N.B. do not do this by assigning from a
28482845
// temporary because the WaitcntBrackets class is large and it could
28492846
// cause this function to use an unreasonable amount of stack space.
28502847
Brackets->~WaitcntBrackets();
2851-
new (Brackets.get()) WaitcntBrackets(
2852-
ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
2848+
new (Brackets.get()) WaitcntBrackets(this);
28532849
}
28542850
}
28552851

0 commit comments

Comments
 (0)