diff options
Diffstat (limited to 'gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp')
| -rw-r--r-- | gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 2050 |
1 files changed, 0 insertions, 2050 deletions
diff --git a/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp deleted file mode 100644 index fb7e670068f..00000000000 --- a/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ /dev/null @@ -1,2050 +0,0 @@ -//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// SI Machine Scheduler interface -// -//===----------------------------------------------------------------------===// - -#include "SIMachineScheduler.h" -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/CodeGen/RegisterPressure.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <map> -#include <set> -#include <utility> -#include <vector> - -using namespace llvm; - -#define DEBUG_TYPE "machine-scheduler" - -// This scheduler implements a different scheduling algorithm than -// GenericScheduler. -// -// There are several specific architecture behaviours that can't be modelled -// for GenericScheduler: -// . When accessing the result of an SGPR load instruction, you have to wait -// for all the SGPR load instructions before your current instruction to -// have finished. -// . When accessing the result of an VGPR load instruction, you have to wait -// for all the VGPR load instructions previous to the VGPR load instruction -// you are interested in to finish. -// . The less the register pressure, the best load latencies are hidden -// -// Moreover some specifities (like the fact a lot of instructions in the shader -// have few dependencies) makes the generic scheduler have some unpredictable -// behaviours. For example when register pressure becomes high, it can either -// manage to prevent register pressure from going too high, or it can -// increase register pressure even more than if it hadn't taken register -// pressure into account. -// -// Also some other bad behaviours are generated, like loading at the beginning -// of the shader a constant in VGPR you won't need until the end of the shader. -// -// The scheduling problem for SI can distinguish three main parts: -// . Hiding high latencies (texture sampling, etc) -// . Hiding low latencies (SGPR constant loading, etc) -// . Keeping register usage low for better latency hiding and general -// performance -// -// Some other things can also affect performance, but are hard to predict -// (cache usage, the fact the HW can issue several instructions from different -// wavefronts if different types, etc) -// -// This scheduler tries to solve the scheduling problem by dividing it into -// simpler sub-problems. It divides the instructions into blocks, schedules -// locally inside the blocks where it takes care of low latencies, and then -// chooses the order of the blocks by taking care of high latencies. -// Dividing the instructions into blocks helps control keeping register -// usage low. -// -// First the instructions are put into blocks. -// We want the blocks help control register usage and hide high latencies -// later. To help control register usage, we typically want all local -// computations, when for example you create a result that can be comsummed -// right away, to be contained in a block. Block inputs and outputs would -// typically be important results that are needed in several locations of -// the shader. Since we do want blocks to help hide high latencies, we want -// the instructions inside the block to have a minimal set of dependencies -// on high latencies. It will make it easy to pick blocks to hide specific -// high latencies. -// The block creation algorithm is divided into several steps, and several -// variants can be tried during the scheduling process. -// -// Second the order of the instructions inside the blocks is chosen. -// At that step we do take into account only register usage and hiding -// low latency instructions -// -// Third the block order is chosen, there we try to hide high latencies -// and keep register usage low. -// -// After the third step, a pass is done to improve the hiding of low -// latencies. -// -// Actually when talking about 'low latency' or 'high latency' it includes -// both the latency to get the cache (or global mem) data go to the register, -// and the bandwidth limitations. -// Increasing the number of active wavefronts helps hide the former, but it -// doesn't solve the latter, thus why even if wavefront count is high, we have -// to try have as many instructions hiding high latencies as possible. -// The OpenCL doc says for example latency of 400 cycles for a global mem access, -// which is hidden by 10 instructions if the wavefront count is 10. - -// Some figures taken from AMD docs: -// Both texture and constant L1 caches are 4-way associative with 64 bytes -// lines. -// Constant cache is shared with 4 CUs. -// For texture sampling, the address generation unit receives 4 texture -// addresses per cycle, thus we could expect texture sampling latency to be -// equivalent to 4 instructions in the very best case (a VGPR is 64 work items, -// instructions in a wavefront group are executed every 4 cycles), -// or 16 instructions if the other wavefronts associated to the 3 other VALUs -// of the CU do texture sampling too. (Don't take these figures too seriously, -// as I'm not 100% sure of the computation) -// Data exports should get similar latency. -// For constant loading, the cache is shader with 4 CUs. -// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" -// I guess if the other CU don't read the cache, it can go up to 64B/cycle. -// It means a simple s_buffer_load should take one instruction to hide, as -// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same -// cache line. -// -// As of today the driver doesn't preload the constants in cache, thus the -// first loads get extra latency. The doc says global memory access can be -// 300-600 cycles. We do not specially take that into account when scheduling -// As we expect the driver to be able to preload the constants soon. - -// common code // - -#ifndef NDEBUG - -static const char *getReasonStr(SIScheduleCandReason Reason) { - switch (Reason) { - case NoCand: return "NOCAND"; - case RegUsage: return "REGUSAGE"; - case Latency: return "LATENCY"; - case Successor: return "SUCCESSOR"; - case Depth: return "DEPTH"; - case NodeOrder: return "ORDER"; - } - llvm_unreachable("Unknown reason!"); -} - -#endif - -namespace llvm { -namespace SISched { -static bool tryLess(int TryVal, int CandVal, - SISchedulerCandidate &TryCand, - SISchedulerCandidate &Cand, - SIScheduleCandReason Reason) { - if (TryVal < CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal > CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; - } - Cand.setRepeat(Reason); - return false; -} - -static bool tryGreater(int TryVal, int CandVal, - SISchedulerCandidate &TryCand, - SISchedulerCandidate &Cand, - SIScheduleCandReason Reason) { - if (TryVal > CandVal) { - TryCand.Reason = Reason; - return true; - } - if (TryVal < CandVal) { - if (Cand.Reason > Reason) - Cand.Reason = Reason; - return true; - } - Cand.setRepeat(Reason); - return false; -} -} // end namespace SISched -} // end namespace llvm - -// SIScheduleBlock // - -void SIScheduleBlock::addUnit(SUnit *SU) { - NodeNum2Index[SU->NodeNum] = SUnits.size(); - SUnits.push_back(SU); -} - -#ifndef NDEBUG -void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { - - dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); - dbgs() << '\n'; -} -#endif - -void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, - SISchedCandidate &TryCand) { - // Initialize the candidate if needed. - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return; - } - - if (Cand.SGPRUsage > 60 && - SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, - TryCand, Cand, RegUsage)) - return; - - // Schedule low latency instructions as top as possible. - // Order of priority is: - // . Low latency instructions which do not depend on other low latency - // instructions we haven't waited for - // . Other instructions which do not depend on low latency instructions - // we haven't waited for - // . Low latencies - // . All other instructions - // Goal is to get: low latency instructions - independent instructions - // - (eventually some more low latency instructions) - // - instructions that depend on the first low latency instructions. - // If in the block there is a lot of constant loads, the SGPR usage - // could go quite high, thus above the arbitrary limit of 60 will encourage - // use the already loaded constants (in order to release some SGPRs) before - // loading more. - if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, - Cand.HasLowLatencyNonWaitedParent, - TryCand, Cand, SIScheduleCandReason::Depth)) - return; - - if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, - TryCand, Cand, SIScheduleCandReason::Depth)) - return; - - if (TryCand.IsLowLatency && - SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, - TryCand, Cand, SIScheduleCandReason::Depth)) - return; - - if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, - TryCand, Cand, RegUsage)) - return; - - // Fall through to original instruction order. - if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { - TryCand.Reason = NodeOrder; - } -} - -SUnit* SIScheduleBlock::pickNode() { - SISchedCandidate TopCand; - - for (SUnit* SU : TopReadySUs) { - SISchedCandidate TryCand; - std::vector<unsigned> pressure; - std::vector<unsigned> MaxPressure; - // Predict register usage after this instruction. - TryCand.SU = SU; - TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); - TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; - TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; - TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; - TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; - TryCand.HasLowLatencyNonWaitedParent = - HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; - tryCandidateTopDown(TopCand, TryCand); - if (TryCand.Reason != NoCand) - TopCand.setBest(TryCand); - } - - return TopCand.SU; -} - - -// Schedule something valid. -void SIScheduleBlock::fastSchedule() { - TopReadySUs.clear(); - if (Scheduled) - undoSchedule(); - - for (SUnit* SU : SUnits) { - if (!SU->NumPredsLeft) - TopReadySUs.push_back(SU); - } - - while (!TopReadySUs.empty()) { - SUnit *SU = TopReadySUs[0]; - ScheduledSUnits.push_back(SU); - nodeScheduled(SU); - } - - Scheduled = true; -} - -// Returns if the register was set between first and last. -static bool isDefBetween(unsigned Reg, - SlotIndex First, SlotIndex Last, - const MachineRegisterInfo *MRI, - const LiveIntervals *LIS) { - for (MachineRegisterInfo::def_instr_iterator - UI = MRI->def_instr_begin(Reg), - UE = MRI->def_instr_end(); UI != UE; ++UI) { - const MachineInstr* MI = &*UI; - if (MI->isDebugValue()) - continue; - SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); - if (InstSlot >= First && InstSlot <= Last) - return true; - } - return false; -} - -void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, - MachineBasicBlock::iterator EndBlock) { - IntervalPressure Pressure, BotPressure; - RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); - LiveIntervals *LIS = DAG->getLIS(); - MachineRegisterInfo *MRI = DAG->getMRI(); - DAG->initRPTracker(TopRPTracker); - DAG->initRPTracker(BotRPTracker); - DAG->initRPTracker(RPTracker); - - // Goes though all SU. RPTracker captures what had to be alive for the SUs - // to execute, and what is still alive at the end. - for (SUnit* SU : ScheduledSUnits) { - RPTracker.setPos(SU->getInstr()); - RPTracker.advance(); - } - - // Close the RPTracker to finalize live ins/outs. - RPTracker.closeRegion(); - - // Initialize the live ins and live outs. - TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); - BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); - - // Do not Track Physical Registers, because it messes up. - for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) - LiveInRegs.insert(RegMaskPair.RegUnit); - } - LiveOutRegs.clear(); - // There is several possibilities to distinguish: - // 1) Reg is not input to any instruction in the block, but is output of one - // 2) 1) + read in the block and not needed after it - // 3) 1) + read in the block but needed in another block - // 4) Reg is input of an instruction but another block will read it too - // 5) Reg is input of an instruction and then rewritten in the block. - // result is not read in the block (implies used in another block) - // 6) Reg is input of an instruction and then rewritten in the block. - // result is read in the block and not needed in another block - // 7) Reg is input of an instruction and then rewritten in the block. - // result is read in the block but also needed in another block - // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 - // We want LiveOutRegs to contain only Regs whose content will be read after - // in another block, and whose content was written in the current block, - // that is we want it to get 1, 3, 5, 7 - // Since we made the MIs of a block to be packed all together before - // scheduling, then the LiveIntervals were correct, and the RPTracker was - // able to correctly handle 5 vs 6, 2 vs 3. - // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) - // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 - // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 - // The use of findDefBetween removes the case 4. - for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { - unsigned Reg = RegMaskPair.RegUnit; - if (TargetRegisterInfo::isVirtualRegister(Reg) && - isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), - LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, - LIS)) { - LiveOutRegs.insert(Reg); - } - } - - // Pressure = sum_alive_registers register size - // Internally llvm will represent some registers as big 128 bits registers - // for example, but they actually correspond to 4 actual 32 bits registers. - // Thus Pressure is not equal to num_alive_registers * constant. - LiveInPressure = TopPressure.MaxSetPressure; - LiveOutPressure = BotPressure.MaxSetPressure; - - // Prepares TopRPTracker for top down scheduling. - TopRPTracker.closeTop(); -} - -void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, - MachineBasicBlock::iterator EndBlock) { - if (!Scheduled) - fastSchedule(); - - // PreScheduling phase to set LiveIn and LiveOut. - initRegPressure(BeginBlock, EndBlock); - undoSchedule(); - - // Schedule for real now. - - TopReadySUs.clear(); - - for (SUnit* SU : SUnits) { - if (!SU->NumPredsLeft) - TopReadySUs.push_back(SU); - } - - while (!TopReadySUs.empty()) { - SUnit *SU = pickNode(); - ScheduledSUnits.push_back(SU); - TopRPTracker.setPos(SU->getInstr()); - TopRPTracker.advance(); - nodeScheduled(SU); - } - - // TODO: compute InternalAdditionnalPressure. - InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); - - // Check everything is right. -#ifndef NDEBUG - assert(SUnits.size() == ScheduledSUnits.size() && - TopReadySUs.empty()); - for (SUnit* SU : SUnits) { - assert(SU->isScheduled && - SU->NumPredsLeft == 0); - } -#endif - - Scheduled = true; -} - -void SIScheduleBlock::undoSchedule() { - for (SUnit* SU : SUnits) { - SU->isScheduled = false; - for (SDep& Succ : SU->Succs) { - if (BC->isSUInBlock(Succ.getSUnit(), ID)) - undoReleaseSucc(SU, &Succ); - } - } - HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); - ScheduledSUnits.clear(); - Scheduled = false; -} - -void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { - SUnit *SuccSU = SuccEdge->getSUnit(); - - if (SuccEdge->isWeak()) { - ++SuccSU->WeakPredsLeft; - return; - } - ++SuccSU->NumPredsLeft; -} - -void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { - SUnit *SuccSU = SuccEdge->getSUnit(); - - if (SuccEdge->isWeak()) { - --SuccSU->WeakPredsLeft; - return; - } -#ifndef NDEBUG - if (SuccSU->NumPredsLeft == 0) { - dbgs() << "*** Scheduling failed! ***\n"; - DAG->dumpNode(*SuccSU); - dbgs() << " has been released too many times!\n"; - llvm_unreachable(nullptr); - } -#endif - - --SuccSU->NumPredsLeft; -} - -/// Release Successors of the SU that are in the block or not. -void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { - for (SDep& Succ : SU->Succs) { - SUnit *SuccSU = Succ.getSUnit(); - - if (SuccSU->NodeNum >= DAG->SUnits.size()) - continue; - - if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) - continue; - - releaseSucc(SU, &Succ); - if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) - TopReadySUs.push_back(SuccSU); - } -} - -void SIScheduleBlock::nodeScheduled(SUnit *SU) { - // Is in TopReadySUs - assert (!SU->NumPredsLeft); - std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); - if (I == TopReadySUs.end()) { - dbgs() << "Data Structure Bug in SI Scheduler\n"; - llvm_unreachable(nullptr); - } - TopReadySUs.erase(I); - - releaseSuccessors(SU, true); - // Scheduling this node will trigger a wait, - // thus propagate to other instructions that they do not need to wait either. - if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) - HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); - - if (DAG->IsLowLatencySU[SU->NodeNum]) { - for (SDep& Succ : SU->Succs) { - std::map<unsigned, unsigned>::iterator I = - NodeNum2Index.find(Succ.getSUnit()->NodeNum); - if (I != NodeNum2Index.end()) - HasLowLatencyNonWaitedParent[I->second] = 1; - } - } - SU->isScheduled = true; -} - -void SIScheduleBlock::finalizeUnits() { - // We remove links from outside blocks to enable scheduling inside the block. - for (SUnit* SU : SUnits) { - releaseSuccessors(SU, false); - if (DAG->IsHighLatencySU[SU->NodeNum]) - HighLatencyBlock = true; - } - HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); -} - -// we maintain ascending order of IDs -void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { - unsigned PredID = Pred->getID(); - - // Check if not already predecessor. - for (SIScheduleBlock* P : Preds) { - if (PredID == P->getID()) - return; - } - Preds.push_back(Pred); - - assert(none_of(Succs, - [=](std::pair<SIScheduleBlock*, - SIScheduleBlockLinkKind> S) { - return PredID == S.first->getID(); - }) && - "Loop in the Block Graph!"); -} - -void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, - SIScheduleBlockLinkKind Kind) { - unsigned SuccID = Succ->getID(); - - // Check if not already predecessor. - for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { - if (SuccID == S.first->getID()) { - if (S.second == SIScheduleBlockLinkKind::NoData && - Kind == SIScheduleBlockLinkKind::Data) - S.second = Kind; - return; - } - } - if (Succ->isHighLatencyBlock()) - ++NumHighLatencySuccessors; - Succs.push_back(std::make_pair(Succ, Kind)); - - assert(none_of(Preds, - [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && - "Loop in the Block Graph!"); -} - -#ifndef NDEBUG -void SIScheduleBlock::printDebug(bool full) { - dbgs() << "Block (" << ID << ")\n"; - if (!full) - return; - - dbgs() << "\nContains High Latency Instruction: " - << HighLatencyBlock << '\n'; - dbgs() << "\nDepends On:\n"; - for (SIScheduleBlock* P : Preds) { - P->printDebug(false); - } - - dbgs() << "\nSuccessors:\n"; - for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { - if (S.second == SIScheduleBlockLinkKind::Data) - dbgs() << "(Data Dep) "; - S.first->printDebug(false); - } - - if (Scheduled) { - dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' - << LiveInPressure[DAG->getVGPRSetID()] << '\n'; - dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' - << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; - dbgs() << "LiveIns:\n"; - for (unsigned Reg : LiveInRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - - dbgs() << "\nLiveOuts:\n"; - for (unsigned Reg : LiveOutRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - } - - dbgs() << "\nInstructions:\n"; - if (!Scheduled) { - for (const SUnit* SU : SUnits) - DAG->dumpNode(*SU); - } else { - for (const SUnit* SU : SUnits) - DAG->dumpNode(*SU); - } - - dbgs() << "///////////////////////\n"; -} -#endif - -// SIScheduleBlockCreator // - -SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : -DAG(DAG) { -} - -SIScheduleBlockCreator::~SIScheduleBlockCreator() = default; - -SIScheduleBlocks -SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { - std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = - Blocks.find(BlockVariant); - if (B == Blocks.end()) { - SIScheduleBlocks Res; - createBlocksForVariant(BlockVariant); - topologicalSort(); - scheduleInsideBlocks(); - fillStats(); - Res.Blocks = CurrentBlocks; - Res.TopDownIndex2Block = TopDownIndex2Block; - Res.TopDownBlock2Index = TopDownBlock2Index; - Blocks[BlockVariant] = Res; - return Res; - } else { - return B->second; - } -} - -bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { - if (SU->NodeNum >= DAG->SUnits.size()) - return false; - return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; -} - -void SIScheduleBlockCreator::colorHighLatenciesAlone() { - unsigned DAGSize = DAG->SUnits.size(); - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - if (DAG->IsHighLatencySU[SU->NodeNum]) { - CurrentColoring[SU->NodeNum] = NextReservedID++; - } - } -} - -static bool -hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { - for (const auto &PredDep : SU.Preds) { - if (PredDep.getSUnit() == &FromSU && - PredDep.getKind() == llvm::SDep::Data) - return true; - } - return false; -} - -void SIScheduleBlockCreator::colorHighLatenciesGroups() { - unsigned DAGSize = DAG->SUnits.size(); - unsigned NumHighLatencies = 0; - unsigned GroupSize; - int Color = NextReservedID; - unsigned Count = 0; - std::set<unsigned> FormingGroup; - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - if (DAG->IsHighLatencySU[SU->NodeNum]) - ++NumHighLatencies; - } - - if (NumHighLatencies == 0) - return; - - if (NumHighLatencies <= 6) - GroupSize = 2; - else if (NumHighLatencies <= 12) - GroupSize = 3; - else - GroupSize = 4; - - for (unsigned SUNum : DAG->TopDownIndex2SU) { - const SUnit &SU = DAG->SUnits[SUNum]; - if (DAG->IsHighLatencySU[SU.NodeNum]) { - unsigned CompatibleGroup = true; - int ProposedColor = Color; - std::vector<int> AdditionalElements; - - // We don't want to put in the same block - // two high latency instructions that depend - // on each other. - // One way would be to check canAddEdge - // in both directions, but that currently is not - // enough because there the high latency order is - // enforced (via links). - // Instead, look at the dependencies between the - // high latency instructions and deduce if it is - // a data dependency or not. - for (unsigned j : FormingGroup) { - bool HasSubGraph; - std::vector<int> SubGraph; - // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary - // in the parent graph of SU. -#ifndef NDEBUG - SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], - HasSubGraph); - assert(!HasSubGraph); -#endif - SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, - HasSubGraph); - if (!HasSubGraph) - continue; // No dependencies between each other - else if (SubGraph.size() > 5) { - // Too many elements would be required to be added to the block. - CompatibleGroup = false; - break; - } - else { - // Check the type of dependency - for (unsigned k : SubGraph) { - // If in the path to join the two instructions, - // there is another high latency instruction, - // or instructions colored for another block - // abort the merge. - if (DAG->IsHighLatencySU[k] || - (CurrentColoring[k] != ProposedColor && - CurrentColoring[k] != 0)) { - CompatibleGroup = false; - break; - } - // If one of the SU in the subgraph depends on the result of SU j, - // there'll be a data dependency. - if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { - CompatibleGroup = false; - break; - } - } - if (!CompatibleGroup) - break; - // Same check for the SU - if (hasDataDependencyPred(SU, DAG->SUnits[j])) { - CompatibleGroup = false; - break; - } - // Add all the required instructions to the block - // These cannot live in another block (because they - // depend (order dependency) on one of the - // instruction in the block, and are required for the - // high latency instruction we add. - AdditionalElements.insert(AdditionalElements.end(), - SubGraph.begin(), SubGraph.end()); - } - } - if (CompatibleGroup) { - FormingGroup.insert(SU.NodeNum); - for (unsigned j : AdditionalElements) - CurrentColoring[j] = ProposedColor; - CurrentColoring[SU.NodeNum] = ProposedColor; - ++Count; - } - // Found one incompatible instruction, - // or has filled a big enough group. - // -> start a new one. - if (!CompatibleGroup) { - FormingGroup.clear(); - Color = ++NextReservedID; - ProposedColor = Color; - FormingGroup.insert(SU.NodeNum); - CurrentColoring[SU.NodeNum] = ProposedColor; - Count = 0; - } else if (Count == GroupSize) { - FormingGroup.clear(); - Color = ++NextReservedID; - ProposedColor = Color; - Count = 0; - } - } - } -} - -void SIScheduleBlockCreator::colorComputeReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); - std::map<std::set<unsigned>, unsigned> ColorCombinations; - - CurrentTopDownReservedDependencyColoring.clear(); - CurrentBottomUpReservedDependencyColoring.clear(); - - CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); - CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); - - // Traverse TopDown, and give different colors to SUs depending - // on which combination of High Latencies they depend on. - - for (unsigned SUNum : DAG->TopDownIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - - // Already given. - if (CurrentColoring[SU->NodeNum]) { - CurrentTopDownReservedDependencyColoring[SU->NodeNum] = - CurrentColoring[SU->NodeNum]; - continue; - } - - for (SDep& PredDep : SU->Preds) { - SUnit *Pred = PredDep.getSUnit(); - if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) - continue; - if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) - SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); - } - // Color 0 by default. - if (SUColors.empty()) - continue; - // Same color than parents. - if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) - CurrentTopDownReservedDependencyColoring[SU->NodeNum] = - *SUColors.begin(); - else { - std::map<std::set<unsigned>, unsigned>::iterator Pos = - ColorCombinations.find(SUColors); - if (Pos != ColorCombinations.end()) { - CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; - } else { - CurrentTopDownReservedDependencyColoring[SU->NodeNum] = - NextNonReservedID; - ColorCombinations[SUColors] = NextNonReservedID++; - } - } - } - - ColorCombinations.clear(); - - // Same as before, but BottomUp. - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - - // Already given. - if (CurrentColoring[SU->NodeNum]) { - CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = - CurrentColoring[SU->NodeNum]; - continue; - } - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) - SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); - } - // Keep color 0. - if (SUColors.empty()) - continue; - // Same color than parents. - if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) - CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = - *SUColors.begin(); - else { - std::map<std::set<unsigned>, unsigned>::iterator Pos = - ColorCombinations.find(SUColors); - if (Pos != ColorCombinations.end()) { - CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; - } else { - CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = - NextNonReservedID; - ColorCombinations[SUColors] = NextNonReservedID++; - } - } - } -} - -void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); - std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; - - // Every combination of colors given by the top down - // and bottom up Reserved node dependency - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - std::pair<unsigned, unsigned> SUColors; - - // High latency instructions: already given. - if (CurrentColoring[SU->NodeNum]) - continue; - - SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; - SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; - - std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = - ColorCombinations.find(SUColors); - if (Pos != ColorCombinations.end()) { - CurrentColoring[SU->NodeNum] = Pos->second; - } else { - CurrentColoring[SU->NodeNum] = NextNonReservedID; - ColorCombinations[SUColors] = NextNonReservedID++; - } - } -} - -void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { - unsigned DAGSize = DAG->SUnits.size(); - std::vector<int> PendingColoring = CurrentColoring; - - assert(DAGSize >= 1 && - CurrentBottomUpReservedDependencyColoring.size() == DAGSize && - CurrentTopDownReservedDependencyColoring.size() == DAGSize); - // If there is no reserved block at all, do nothing. We don't want - // everything in one block. - if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), - CurrentBottomUpReservedDependencyColoring.end()) == 0 && - *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), - CurrentTopDownReservedDependencyColoring.end()) == 0) - return; - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - std::set<unsigned> SUColorsPending; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || - CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || - CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) - SUColors.insert(CurrentColoring[Succ->NodeNum]); - SUColorsPending.insert(PendingColoring[Succ->NodeNum]); - } - // If there is only one child/parent block, and that block - // is not among the ones we are removing in this path, then - // merge the instruction to that block - if (SUColors.size() == 1 && SUColorsPending.size() == 1) - PendingColoring[SU->NodeNum] = *SUColors.begin(); - else // TODO: Attribute new colors depending on color - // combination of children. - PendingColoring[SU->NodeNum] = NextNonReservedID++; - } - CurrentColoring = PendingColoring; -} - - -void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { - unsigned DAGSize = DAG->SUnits.size(); - unsigned PreviousColor; - std::set<unsigned> SeenColors; - - if (DAGSize <= 1) - return; - - PreviousColor = CurrentColoring[0]; - - for (unsigned i = 1, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - unsigned CurrentColor = CurrentColoring[i]; - unsigned PreviousColorSave = PreviousColor; - assert(i == SU->NodeNum); - - if (CurrentColor != PreviousColor) - SeenColors.insert(PreviousColor); - PreviousColor = CurrentColor; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - if (SeenColors.find(CurrentColor) == SeenColors.end()) - continue; - - if (PreviousColorSave != CurrentColor) - CurrentColoring[i] = NextNonReservedID++; - else - CurrentColoring[i] = CurrentColoring[i-1]; - } -} - -void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { - unsigned DAGSize = DAG->SUnits.size(); - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - // No predecessor: Vgpr constant loading. - // Low latency instructions usually have a predecessor (the address) - if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - SUColors.insert(CurrentColoring[Succ->NodeNum]); - } - if (SUColors.size() == 1) - CurrentColoring[SU->NodeNum] = *SUColors.begin(); - } -} - -void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { - unsigned DAGSize = DAG->SUnits.size(); - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - SUColors.insert(CurrentColoring[Succ->NodeNum]); - } - if (SUColors.size() == 1) - CurrentColoring[SU->NodeNum] = *SUColors.begin(); - } -} - -void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { - unsigned DAGSize = DAG->SUnits.size(); - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - std::set<unsigned> SUColors; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - SUColors.insert(CurrentColoring[Succ->NodeNum]); - } - if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) - CurrentColoring[SU->NodeNum] = *SUColors.begin(); - } -} - -void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { - unsigned DAGSize = DAG->SUnits.size(); - std::map<unsigned, unsigned> ColorCount; - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - unsigned color = CurrentColoring[SU->NodeNum]; - ++ColorCount[color]; - } - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - unsigned color = CurrentColoring[SU->NodeNum]; - std::set<unsigned> SUColors; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - if (ColorCount[color] > 1) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - SUColors.insert(CurrentColoring[Succ->NodeNum]); - } - if (SUColors.size() == 1 && *SUColors.begin() != color) { - --ColorCount[color]; - CurrentColoring[SU->NodeNum] = *SUColors.begin(); - ++ColorCount[*SUColors.begin()]; - } - } -} - -void SIScheduleBlockCreator::cutHugeBlocks() { - // TODO -} - -void SIScheduleBlockCreator::regroupNoUserInstructions() { - unsigned DAGSize = DAG->SUnits.size(); - int GroupID = NextNonReservedID++; - - for (unsigned SUNum : DAG->BottomUpIndex2SU) { - SUnit *SU = &DAG->SUnits[SUNum]; - bool hasSuccessor = false; - - if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) - continue; - - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - hasSuccessor = true; - } - if (!hasSuccessor) - CurrentColoring[SU->NodeNum] = GroupID; - } -} - -void SIScheduleBlockCreator::colorExports() { - unsigned ExportColor = NextNonReservedID++; - SmallVector<unsigned, 8> ExpGroup; - - // Put all exports together in a block. - // The block will naturally end up being scheduled last, - // thus putting exports at the end of the schedule, which - // is better for performance. - // However we must ensure, for safety, the exports can be put - // together in the same block without any other instruction. - // This could happen, for example, when scheduling after regalloc - // if reloading a spilled register from memory using the same - // register than used in a previous export. - // If that happens, do not regroup the exports. - for (unsigned SUNum : DAG->TopDownIndex2SU) { - const SUnit &SU = DAG->SUnits[SUNum]; - if (SIInstrInfo::isEXP(*SU.getInstr())) { - // Check the EXP can be added to the group safely, - // ie without needing any other instruction. - // The EXP is allowed to depend on other EXP - // (they will be in the same group). - for (unsigned j : ExpGroup) { - bool HasSubGraph; - std::vector<int> SubGraph; - // By construction (topological order), if SU and - // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary - // in the parent graph of SU. -#ifndef NDEBUG - SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], - HasSubGraph); - assert(!HasSubGraph); -#endif - SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, - HasSubGraph); - if (!HasSubGraph) - continue; // No dependencies between each other - - // SubGraph contains all the instructions required - // between EXP SUnits[j] and EXP SU. - for (unsigned k : SubGraph) { - if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) - // Other instructions than EXP would be required in the group. - // Abort the groupping. - return; - } - } - - ExpGroup.push_back(SUNum); - } - } - - // The group can be formed. Give the color. - for (unsigned j : ExpGroup) - CurrentColoring[j] = ExportColor; -} - -void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { - unsigned DAGSize = DAG->SUnits.size(); - std::map<unsigned,unsigned> RealID; - - CurrentBlocks.clear(); - CurrentColoring.clear(); - CurrentColoring.resize(DAGSize, 0); - Node2CurrentBlock.clear(); - - // Restore links previous scheduling variant has overridden. - DAG->restoreSULinksLeft(); - - NextReservedID = 1; - NextNonReservedID = DAGSize + 1; - - LLVM_DEBUG(dbgs() << "Coloring the graph\n"); - - if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) - colorHighLatenciesGroups(); - else - colorHighLatenciesAlone(); - colorComputeReservedDependencies(); - colorAccordingToReservedDependencies(); - colorEndsAccordingToDependencies(); - if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) - colorForceConsecutiveOrderInGroup(); - regroupNoUserInstructions(); - colorMergeConstantLoadsNextGroup(); - colorMergeIfPossibleNextGroupOnlyForReserved(); - colorExports(); - - // Put SUs of same color into same block - Node2CurrentBlock.resize(DAGSize, -1); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - unsigned Color = CurrentColoring[SU->NodeNum]; - if (RealID.find(Color) == RealID.end()) { - int ID = CurrentBlocks.size(); - BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID)); - CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); - RealID[Color] = ID; - } - CurrentBlocks[RealID[Color]]->addUnit(SU); - Node2CurrentBlock[SU->NodeNum] = RealID[Color]; - } - - // Build dependencies between blocks. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - int SUID = Node2CurrentBlock[i]; - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) - continue; - if (Node2CurrentBlock[Succ->NodeNum] != SUID) - CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], - SuccDep.isCtrl() ? NoData : Data); - } - for (SDep& PredDep : SU->Preds) { - SUnit *Pred = PredDep.getSUnit(); - if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) - continue; - if (Node2CurrentBlock[Pred->NodeNum] != SUID) - CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); - } - } - - // Free root and leafs of all blocks to enable scheduling inside them. - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->finalizeUnits(); - } - LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); -} - -// Two functions taken from Codegen/MachineScheduler.cpp - -/// Non-const version. -static MachineBasicBlock::iterator -nextIfDebug(MachineBasicBlock::iterator I, - MachineBasicBlock::const_iterator End) { - for (; I != End; ++I) { - if (!I->isDebugInstr()) - break; - } - return I; -} - -void SIScheduleBlockCreator::topologicalSort() { - unsigned DAGSize = CurrentBlocks.size(); - std::vector<int> WorkList; - - LLVM_DEBUG(dbgs() << "Topological Sort\n"); - - WorkList.reserve(DAGSize); - TopDownIndex2Block.resize(DAGSize); - TopDownBlock2Index.resize(DAGSize); - BottomUpIndex2Block.resize(DAGSize); - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - unsigned Degree = Block->getSuccs().size(); - TopDownBlock2Index[i] = Degree; - if (Degree == 0) { - WorkList.push_back(i); - } - } - - int Id = DAGSize; - while (!WorkList.empty()) { - int i = WorkList.back(); - SIScheduleBlock *Block = CurrentBlocks[i]; - WorkList.pop_back(); - TopDownBlock2Index[i] = --Id; - TopDownIndex2Block[Id] = i; - for (SIScheduleBlock* Pred : Block->getPreds()) { - if (!--TopDownBlock2Index[Pred->getID()]) - WorkList.push_back(Pred->getID()); - } - } - -#ifndef NDEBUG - // Check correctness of the ordering. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - for (SIScheduleBlock* Pred : Block->getPreds()) { - assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && - "Wrong Top Down topological sorting"); - } - } -#endif - - BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), - TopDownIndex2Block.rend()); -} - -void SIScheduleBlockCreator::scheduleInsideBlocks() { - unsigned DAGSize = CurrentBlocks.size(); - - LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n"); - - // We do schedule a valid scheduling such that a Block corresponds - // to a range of instructions. - LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->fastSchedule(); - } - - // Note: the following code, and the part restoring previous position - // is by far the most expensive operation of the Scheduler. - - // Do not update CurrentTop. - MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); - std::vector<MachineBasicBlock::iterator> PosOld; - std::vector<MachineBasicBlock::iterator> PosNew; - PosOld.reserve(DAG->SUnits.size()); - PosNew.reserve(DAG->SUnits.size()); - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - int BlockIndice = TopDownIndex2Block[i]; - SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; - std::vector<SUnit*> SUs = Block->getScheduledUnits(); - - for (SUnit* SU : SUs) { - MachineInstr *MI = SU->getInstr(); - MachineBasicBlock::iterator Pos = MI; - PosOld.push_back(Pos); - if (&*CurrentTopFastSched == MI) { - PosNew.push_back(Pos); - CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, - DAG->getCurrentBottom()); - } else { - // Update the instruction stream. - DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); - - // Update LiveIntervals. - // Note: Moving all instructions and calling handleMove every time - // is the most cpu intensive operation of the scheduler. - // It would gain a lot if there was a way to recompute the - // LiveIntervals for the entire scheduling region. - DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true); - PosNew.push_back(CurrentTopFastSched); - } - } - } - - // Now we have Block of SUs == Block of MI. - // We do the final schedule for the instructions inside the block. - // The property that all the SUs of the Block are grouped together as MI - // is used for correct reg usage tracking. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - std::vector<SUnit*> SUs = Block->getScheduledUnits(); - Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); - } - - LLVM_DEBUG(dbgs() << "Restoring MI Pos\n"); - // Restore old ordering (which prevents a LIS->handleMove bug). - for (unsigned i = PosOld.size(), e = 0; i != e; --i) { - MachineBasicBlock::iterator POld = PosOld[i-1]; - MachineBasicBlock::iterator PNew = PosNew[i-1]; - if (PNew != POld) { - // Update the instruction stream. - DAG->getBB()->splice(POld, DAG->getBB(), PNew); - - // Update LiveIntervals. - DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true); - } - } - - LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); -} - -void SIScheduleBlockCreator::fillStats() { - unsigned DAGSize = CurrentBlocks.size(); - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - int BlockIndice = TopDownIndex2Block[i]; - SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; - if (Block->getPreds().empty()) - Block->Depth = 0; - else { - unsigned Depth = 0; - for (SIScheduleBlock *Pred : Block->getPreds()) { - if (Depth < Pred->Depth + Pred->getCost()) - Depth = Pred->Depth + Pred->getCost(); - } - Block->Depth = Depth; - } - } - - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - int BlockIndice = BottomUpIndex2Block[i]; - SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; - if (Block->getSuccs().empty()) - Block->Height = 0; - else { - unsigned Height = 0; - for (const auto &Succ : Block->getSuccs()) - Height = std::max(Height, Succ.first->Height + Succ.first->getCost()); - Block->Height = Height; - } - } -} - -// SIScheduleBlockScheduler // - -SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, - SISchedulerBlockSchedulerVariant Variant, - SIScheduleBlocks BlocksStruct) : - DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), - LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), - SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { - - // Fill the usage of every output - // Warning: while by construction we always have a link between two blocks - // when one needs a result from the other, the number of users of an output - // is not the sum of child blocks having as input the same virtual register. - // Here is an example. A produces x and y. B eats x and produces x'. - // C eats x' and y. The register coalescer may have attributed the same - // virtual register to x and x'. - // To count accurately, we do a topological sort. In case the register is - // found for several parents, we increment the usage of the one with the - // highest topological index. - LiveOutRegsNumUsages.resize(Blocks.size()); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; - for (unsigned Reg : Block->getInRegs()) { - bool Found = false; - int topoInd = -1; - for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set<unsigned> PredOutRegs = Pred->getOutRegs(); - std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); - - if (RegPos != PredOutRegs.end()) { - Found = true; - if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { - topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; - } - } - } - - if (!Found) - continue; - - int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; - ++LiveOutRegsNumUsages[PredID][Reg]; - } - } - - LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); - BlockNumPredsLeft.resize(Blocks.size()); - BlockNumSuccsLeft.resize(Blocks.size()); - - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; - BlockNumPredsLeft[i] = Block->getPreds().size(); - BlockNumSuccsLeft[i] = Block->getSuccs().size(); - } - -#ifndef NDEBUG - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; - assert(Block->getID() == i); - } -#endif - - std::set<unsigned> InRegs = DAG->getInRegs(); - addLiveRegs(InRegs); - - // Increase LiveOutRegsNumUsages for blocks - // producing registers consumed in another - // scheduling region. - for (unsigned Reg : DAG->getOutRegs()) { - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - // Do reverse traversal - int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; - SIScheduleBlock *Block = Blocks[ID]; - const std::set<unsigned> &OutRegs = Block->getOutRegs(); - - if (OutRegs.find(Reg) == OutRegs.end()) - continue; - - ++LiveOutRegsNumUsages[ID][Reg]; - break; - } - } - - // Fill LiveRegsConsumers for regs that were already - // defined before scheduling. - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; - for (unsigned Reg : Block->getInRegs()) { - bool Found = false; - for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set<unsigned> PredOutRegs = Pred->getOutRegs(); - std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); - - if (RegPos != PredOutRegs.end()) { - Found = true; - break; - } - } - - if (!Found) - ++LiveRegsConsumers[Reg]; - } - } - - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; - if (BlockNumPredsLeft[i] == 0) { - ReadyBlocks.push_back(Block); - } - } - - while (SIScheduleBlock *Block = pickBlock()) { - BlocksScheduled.push_back(Block); - blockScheduled(Block); - } - - LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block - : BlocksScheduled) { - dbgs() << ' ' << Block->getID(); - } dbgs() << '\n';); -} - -bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, - SIBlockSchedCandidate &TryCand) { - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return true; - } - - // Try to hide high latencies. - if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled, - Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) - return true; - // Schedule high latencies early so you can hide them better. - if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, - TryCand, Cand, Latency)) - return true; - if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height, - TryCand, Cand, Depth)) - return true; - if (SISched::tryGreater(TryCand.NumHighLatencySuccessors, - Cand.NumHighLatencySuccessors, - TryCand, Cand, Successor)) - return true; - return false; -} - -bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, - SIBlockSchedCandidate &TryCand) { - if (!Cand.isValid()) { - TryCand.Reason = NodeOrder; - return true; - } - - if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, - TryCand, Cand, RegUsage)) - return true; - if (SISched::tryGreater(TryCand.NumSuccessors > 0, - Cand.NumSuccessors > 0, - TryCand, Cand, Successor)) - return true; - if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) - return true; - if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, - TryCand, Cand, RegUsage)) - return true; - return false; -} - -SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { - SIBlockSchedCandidate Cand; - std::vector<SIScheduleBlock*>::iterator Best; - SIScheduleBlock *Block; - if (ReadyBlocks.empty()) - return nullptr; - - DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), - VregCurrentUsage, SregCurrentUsage); - if (VregCurrentUsage > maxVregUsage) - maxVregUsage = VregCurrentUsage; - if (SregCurrentUsage > maxSregUsage) - maxSregUsage = SregCurrentUsage; - LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block - : ReadyBlocks) dbgs() - << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (unsigned Reg - : LiveRegs) dbgs() - << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << '\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); - - Cand.Block = nullptr; - for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), - E = ReadyBlocks.end(); I != E; ++I) { - SIBlockSchedCandidate TryCand; - TryCand.Block = *I; - TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); - TryCand.VGPRUsageDiff = - checkRegUsageImpact(TryCand.Block->getInRegs(), - TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; - TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); - TryCand.NumHighLatencySuccessors = - TryCand.Block->getNumHighLatencySuccessors(); - TryCand.LastPosHighLatParentScheduled = - (unsigned int) std::max<int> (0, - LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - - LastPosWaitedHighLatency); - TryCand.Height = TryCand.Block->Height; - // Try not to increase VGPR usage too much, else we may spill. - if (VregCurrentUsage > 120 || - Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { - if (!tryCandidateRegUsage(Cand, TryCand) && - Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) - tryCandidateLatency(Cand, TryCand); - } else { - if (!tryCandidateLatency(Cand, TryCand)) - tryCandidateRegUsage(Cand, TryCand); - } - if (TryCand.Reason != NoCand) { - Cand.setBest(TryCand); - Best = I; - LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' - << getReasonStr(Cand.Reason) << '\n'); - } - } - - LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; - dbgs() << "Is a block with high latency instruction: " - << (Cand.IsHighLatency ? "yes\n" : "no\n"); - dbgs() << "Position of last high latency dependency: " - << Cand.LastPosHighLatParentScheduled << '\n'; - dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; - dbgs() << '\n';); - - Block = Cand.Block; - ReadyBlocks.erase(Best); - return Block; -} - -// Tracking of currently alive registers to determine VGPR Usage. - -void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { - for (unsigned Reg : Regs) { - // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - // If not already in the live set, then add it. - (void) LiveRegs.insert(Reg); - } -} - -void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, - std::set<unsigned> &Regs) { - for (unsigned Reg : Regs) { - // For now only track virtual registers. - std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); - assert (Pos != LiveRegs.end() && // Reg must be live. - LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && - LiveRegsConsumers[Reg] >= 1); - --LiveRegsConsumers[Reg]; - if (LiveRegsConsumers[Reg] == 0) - LiveRegs.erase(Pos); - } -} - -void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { - for (const auto &Block : Parent->getSuccs()) { - if (--BlockNumPredsLeft[Block.first->getID()] == 0) - ReadyBlocks.push_back(Block.first); - - if (Parent->isHighLatencyBlock() && - Block.second == SIScheduleBlockLinkKind::Data) - LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; - } -} - -void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { - decreaseLiveRegs(Block, Block->getInRegs()); - addLiveRegs(Block->getOutRegs()); - releaseBlockSuccs(Block); - for (std::map<unsigned, unsigned>::iterator RegI = - LiveOutRegsNumUsages[Block->getID()].begin(), - E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { - std::pair<unsigned, unsigned> RegP = *RegI; - // We produce this register, thus it must not be previously alive. - assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || - LiveRegsConsumers[RegP.first] == 0); - LiveRegsConsumers[RegP.first] += RegP.second; - } - if (LastPosHighLatencyParentScheduled[Block->getID()] > - (unsigned)LastPosWaitedHighLatency) - LastPosWaitedHighLatency = - LastPosHighLatencyParentScheduled[Block->getID()]; - ++NumBlockScheduled; -} - -std::vector<int> -SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, - std::set<unsigned> &OutRegs) { - std::vector<int> DiffSetPressure; - DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); - - for (unsigned Reg : InRegs) { - // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - if (LiveRegsConsumers[Reg] > 1) - continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); - for (; PSetI.isValid(); ++PSetI) { - DiffSetPressure[*PSetI] -= PSetI.getWeight(); - } - } - - for (unsigned Reg : OutRegs) { - // For now only track virtual registers. - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); - for (; PSetI.isValid(); ++PSetI) { - DiffSetPressure[*PSetI] += PSetI.getWeight(); - } - } - - return DiffSetPressure; -} - -// SIScheduler // - -struct SIScheduleBlockResult -SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, - SISchedulerBlockSchedulerVariant ScheduleVariant) { - SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); - SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); - std::vector<SIScheduleBlock*> ScheduledBlocks; - struct SIScheduleBlockResult Res; - - ScheduledBlocks = Scheduler.getBlocks(); - - for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { - SIScheduleBlock *Block = ScheduledBlocks[b]; - std::vector<SUnit*> SUs = Block->getScheduledUnits(); - - for (SUnit* SU : SUs) - Res.SUs.push_back(SU->NodeNum); - } - - Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); - Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); - return Res; -} - -// SIScheduleDAGMI // - -SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : - ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) { - SITII = static_cast<const SIInstrInfo*>(TII); - SITRI = static_cast<const SIRegisterInfo*>(TRI); - - VGPRSetID = SITRI->getVGPRPressureSet(); - SGPRSetID = SITRI->getSGPRPressureSet(); -} - -SIScheduleDAGMI::~SIScheduleDAGMI() = default; - -// Code adapted from scheduleDAG.cpp -// Does a topological sort over the SUs. -// Both TopDown and BottomUp -void SIScheduleDAGMI::topologicalSort() { - Topo.InitDAGTopologicalSorting(); - - TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); - BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); -} - -// Move low latencies further from their user without -// increasing SGPR usage (in general) -// This is to be replaced by a better pass that would -// take into account SGPR usage (based on VGPR Usage -// and the corresponding wavefront count), that would -// try to merge groups of loads if it make sense, etc -void SIScheduleDAGMI::moveLowLatencies() { - unsigned DAGSize = SUnits.size(); - int LastLowLatencyUser = -1; - int LastLowLatencyPos = -1; - - for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { - SUnit *SU = &SUnits[ScheduledSUnits[i]]; - bool IsLowLatencyUser = false; - unsigned MinPos = 0; - - for (SDep& PredDep : SU->Preds) { - SUnit *Pred = PredDep.getSUnit(); - if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { - IsLowLatencyUser = true; - } - if (Pred->NodeNum >= DAGSize) - continue; - unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; - if (PredPos >= MinPos) - MinPos = PredPos + 1; - } - - if (SITII->isLowLatencyInstruction(*SU->getInstr())) { - unsigned BestPos = LastLowLatencyUser + 1; - if ((int)BestPos <= LastLowLatencyPos) - BestPos = LastLowLatencyPos + 1; - if (BestPos < MinPos) - BestPos = MinPos; - if (BestPos < i) { - for (unsigned u = i; u > BestPos; --u) { - ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; - ScheduledSUnits[u] = ScheduledSUnits[u-1]; - } - ScheduledSUnits[BestPos] = SU->NodeNum; - ScheduledSUnitsInv[SU->NodeNum] = BestPos; - } - LastLowLatencyPos = BestPos; - if (IsLowLatencyUser) - LastLowLatencyUser = BestPos; - } else if (IsLowLatencyUser) { - LastLowLatencyUser = i; - // Moves COPY instructions on which depends - // the low latency instructions too. - } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { - bool CopyForLowLat = false; - for (SDep& SuccDep : SU->Succs) { - SUnit *Succ = SuccDep.getSUnit(); - if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { - CopyForLowLat = true; - } - } - if (!CopyForLowLat) - continue; - if (MinPos < i) { - for (unsigned u = i; u > MinPos; --u) { - ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; - ScheduledSUnits[u] = ScheduledSUnits[u-1]; - } - ScheduledSUnits[MinPos] = SU->NodeNum; - ScheduledSUnitsInv[SU->NodeNum] = MinPos; - } - } - } -} - -void SIScheduleDAGMI::restoreSULinksLeft() { - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - SUnits[i].isScheduled = false; - SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; - SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; - SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; - SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; - } -} - -// Return the Vgpr and Sgpr usage corresponding to some virtual registers. -template<typename _Iterator> void -SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, - unsigned &VgprUsage, unsigned &SgprUsage) { - VgprUsage = 0; - SgprUsage = 0; - for (_Iterator RegI = First; RegI != End; ++RegI) { - unsigned Reg = *RegI; - // For now only track virtual registers - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - PSetIterator PSetI = MRI.getPressureSets(Reg); - for (; PSetI.isValid(); ++PSetI) { - if (*PSetI == VGPRSetID) - VgprUsage += PSetI.getWeight(); - else if (*PSetI == SGPRSetID) - SgprUsage += PSetI.getWeight(); - } - } -} - -void SIScheduleDAGMI::schedule() -{ - SmallVector<SUnit*, 8> TopRoots, BotRoots; - SIScheduleBlockResult Best, Temp; - LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); - - buildDAGWithRegPressure(); - LLVM_DEBUG(dump()); - - topologicalSort(); - findRootsAndBiasEdges(TopRoots, BotRoots); - // We reuse several ScheduleDAGMI and ScheduleDAGMILive - // functions, but to make them happy we must initialize - // the default Scheduler implementation (even if we do not - // run it) - SchedImpl->initialize(this); - initQueues(TopRoots, BotRoots); - - // Fill some stats to help scheduling. - - SUnitsLinksBackup = SUnits; - IsLowLatencySU.clear(); - LowLatencyOffset.clear(); - IsHighLatencySU.clear(); - - IsLowLatencySU.resize(SUnits.size(), 0); - LowLatencyOffset.resize(SUnits.size(), 0); - IsHighLatencySU.resize(SUnits.size(), 0); - - for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { - SUnit *SU = &SUnits[i]; - MachineOperand *BaseLatOp; - int64_t OffLatReg; - if (SITII->isLowLatencyInstruction(*SU->getInstr())) { - IsLowLatencySU[i] = 1; - if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, - TRI)) - LowLatencyOffset[i] = OffLatReg; - } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) - IsHighLatencySU[i] = 1; - } - - SIScheduler Scheduler(this); - Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, - SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); - - // if VGPR usage is extremely high, try other good performing variants - // which could lead to lower VGPR usage - if (Best.MaxVGPRUsage > 180) { - static const std::pair<SISchedulerBlockCreatorVariant, - SISchedulerBlockSchedulerVariant> - Variants[] = { - { LatenciesAlone, BlockRegUsageLatency }, -// { LatenciesAlone, BlockRegUsage }, - { LatenciesGrouped, BlockLatencyRegUsage }, -// { LatenciesGrouped, BlockRegUsageLatency }, -// { LatenciesGrouped, BlockRegUsage }, - { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, -// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, -// { LatenciesAlonePlusConsecutive, BlockRegUsage } - }; - for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { - Temp = Scheduler.scheduleVariant(v.first, v.second); - if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) - Best = Temp; - } - } - // if VGPR usage is still extremely high, we may spill. Try other variants - // which are less performing, but that could lead to lower VGPR usage. - if (Best.MaxVGPRUsage > 200) { - static const std::pair<SISchedulerBlockCreatorVariant, - SISchedulerBlockSchedulerVariant> - Variants[] = { -// { LatenciesAlone, BlockRegUsageLatency }, - { LatenciesAlone, BlockRegUsage }, -// { LatenciesGrouped, BlockLatencyRegUsage }, - { LatenciesGrouped, BlockRegUsageLatency }, - { LatenciesGrouped, BlockRegUsage }, -// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, - { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, - { LatenciesAlonePlusConsecutive, BlockRegUsage } - }; - for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { - Temp = Scheduler.scheduleVariant(v.first, v.second); - if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) - Best = Temp; - } - } - - ScheduledSUnits = Best.SUs; - ScheduledSUnitsInv.resize(SUnits.size()); - - for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { - ScheduledSUnitsInv[ScheduledSUnits[i]] = i; - } - - moveLowLatencies(); - - // Tell the outside world about the result of the scheduling. - - assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); - TopRPTracker.setPos(CurrentTop); - - for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), - E = ScheduledSUnits.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; - - scheduleMI(SU, true); - - LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " - << *SU->getInstr()); - } - - assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); - - placeDebugValues(); - - LLVM_DEBUG({ - dbgs() << "*** Final schedule for " - << printMBBReference(*begin()->getParent()) << " ***\n"; - dumpSchedule(); - dbgs() << '\n'; - }); -} |
