Remove LLVM 8.0.1 files.

author: patrick <patrick@openbsd.org> 2020-08-03 15:06:44 +0000
committer: patrick <patrick@openbsd.org> 2020-08-03 15:06:44 +0000
commit: b64793999546ed8adebaeebd9d8345d18db8927d (patch)
tree: 4357c27b561d73b0e089727c6ed659f2ceff5f47 /gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
parent: Add support for UTF-8 DISPLAY-HINTs with octet length. For now only (diff)
download: wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.tar.xz
wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.zip
1 files changed, 0 insertions, 2050 deletions
diff --git a/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
deleted file mode 100644
index fb7e670068f..00000000000
--- a/gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ /dev/null
@@ -1,2050 +0,0 @@
-//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// SI Machine Scheduler interface
-//
-//===----------------------------------------------------------------------===//
-
-#include "SIMachineScheduler.h"
-#include "AMDGPU.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/CodeGen/RegisterPressure.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <cassert>
-#include <map>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "machine-scheduler"
-
-// This scheduler implements a different scheduling algorithm than
-// GenericScheduler.
-//
-// There are several specific architecture behaviours that can't be modelled
-// for GenericScheduler:
-// . When accessing the result of an SGPR load instruction, you have to wait
-// for all the SGPR load instructions before your current instruction to
-// have finished.
-// . When accessing the result of an VGPR load instruction, you have to wait
-// for all the VGPR load instructions previous to the VGPR load instruction
-// you are interested in to finish.
-// . The less the register pressure, the best load latencies are hidden
-//
-// Moreover some specifities (like the fact a lot of instructions in the shader
-// have few dependencies) makes the generic scheduler have some unpredictable
-// behaviours. For example when register pressure becomes high, it can either
-// manage to prevent register pressure from going too high, or it can
-// increase register pressure even more than if it hadn't taken register
-// pressure into account.
-//
-// Also some other bad behaviours are generated, like loading at the beginning
-// of the shader a constant in VGPR you won't need until the end of the shader.
-//
-// The scheduling problem for SI can distinguish three main parts:
-// . Hiding high latencies (texture sampling, etc)
-// . Hiding low latencies (SGPR constant loading, etc)
-// . Keeping register usage low for better latency hiding and general
-//   performance
-//
-// Some other things can also affect performance, but are hard to predict
-// (cache usage, the fact the HW can issue several instructions from different
-// wavefronts if different types, etc)
-//
-// This scheduler tries to solve the scheduling problem by dividing it into
-// simpler sub-problems. It divides the instructions into blocks, schedules
-// locally inside the blocks where it takes care of low latencies, and then
-// chooses the order of the blocks by taking care of high latencies.
-// Dividing the instructions into blocks helps control keeping register
-// usage low.
-//
-// First the instructions are put into blocks.
-//   We want the blocks help control register usage and hide high latencies
-//   later. To help control register usage, we typically want all local
-//   computations, when for example you create a result that can be comsummed
-//   right away, to be contained in a block. Block inputs and outputs would
-//   typically be important results that are needed in several locations of
-//   the shader. Since we do want blocks to help hide high latencies, we want
-//   the instructions inside the block to have a minimal set of dependencies
-//   on high latencies. It will make it easy to pick blocks to hide specific
-//   high latencies.
-//   The block creation algorithm is divided into several steps, and several
-//   variants can be tried during the scheduling process.
-//
-// Second the order of the instructions inside the blocks is chosen.
-//   At that step we do take into account only register usage and hiding
-//   low latency instructions
-//
-// Third the block order is chosen, there we try to hide high latencies
-// and keep register usage low.
-//
-// After the third step, a pass is done to improve the hiding of low
-// latencies.
-//
-// Actually when talking about 'low latency' or 'high latency' it includes
-// both the latency to get the cache (or global mem) data go to the register,
-// and the bandwidth limitations.
-// Increasing the number of active wavefronts helps hide the former, but it
-// doesn't solve the latter, thus why even if wavefront count is high, we have
-// to try have as many instructions hiding high latencies as possible.
-// The OpenCL doc says for example latency of 400 cycles for a global mem access,
-// which is hidden by 10 instructions if the wavefront count is 10.
-
-// Some figures taken from AMD docs:
-// Both texture and constant L1 caches are 4-way associative with 64 bytes
-// lines.
-// Constant cache is shared with 4 CUs.
-// For texture sampling, the address generation unit receives 4 texture
-// addresses per cycle, thus we could expect texture sampling latency to be
-// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
-// instructions in a wavefront group are executed every 4 cycles),
-// or 16 instructions if the other wavefronts associated to the 3 other VALUs
-// of the CU do texture sampling too. (Don't take these figures too seriously,
-// as I'm not 100% sure of the computation)
-// Data exports should get similar latency.
-// For constant loading, the cache is shader with 4 CUs.
-// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
-// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
-// It means a simple s_buffer_load should take one instruction to hide, as
-// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
-// cache line.
-//
-// As of today the driver doesn't preload the constants in cache, thus the
-// first loads get extra latency. The doc says global memory access can be
-// 300-600 cycles. We do not specially take that into account when scheduling
-// As we expect the driver to be able to preload the constants soon.
-
-// common code //
-
-#ifndef NDEBUG
-
-static const char *getReasonStr(SIScheduleCandReason Reason) {
-  switch (Reason) {
-  case NoCand:         return "NOCAND";
-  case RegUsage:       return "REGUSAGE";
-  case Latency:        return "LATENCY";
-  case Successor:      return "SUCCESSOR";
-  case Depth:          return "DEPTH";
-  case NodeOrder:      return "ORDER";
-  }
-  llvm_unreachable("Unknown reason!");
-}
-
-#endif
-
-namespace llvm {
-namespace SISched {
-static bool tryLess(int TryVal, int CandVal,
-                    SISchedulerCandidate &TryCand,
-                    SISchedulerCandidate &Cand,
-                    SIScheduleCandReason Reason) {
-  if (TryVal < CandVal) {
-    TryCand.Reason = Reason;
-    return true;
-  }
-  if (TryVal > CandVal) {
-    if (Cand.Reason > Reason)
-      Cand.Reason = Reason;
-    return true;
-  }
-  Cand.setRepeat(Reason);
-  return false;
-}
-
-static bool tryGreater(int TryVal, int CandVal,
-                       SISchedulerCandidate &TryCand,
-                       SISchedulerCandidate &Cand,
-                       SIScheduleCandReason Reason) {
-  if (TryVal > CandVal) {
-    TryCand.Reason = Reason;
-    return true;
-  }
-  if (TryVal < CandVal) {
-    if (Cand.Reason > Reason)
-      Cand.Reason = Reason;
-    return true;
-  }
-  Cand.setRepeat(Reason);
-  return false;
-}
-} // end namespace SISched
-} // end namespace llvm
-
-// SIScheduleBlock //
-
-void SIScheduleBlock::addUnit(SUnit *SU) {
-  NodeNum2Index[SU->NodeNum] = SUnits.size();
-  SUnits.push_back(SU);
-}
-
-#ifndef NDEBUG
-void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
-
-  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
-  dbgs() << '\n';
-}
-#endif
-
-void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
-                                          SISchedCandidate &TryCand) {
-  // Initialize the candidate if needed.
-  if (!Cand.isValid()) {
-    TryCand.Reason = NodeOrder;
-    return;
-  }
-
-  if (Cand.SGPRUsage > 60 &&
-      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
-                       TryCand, Cand, RegUsage))
-    return;
-
-  // Schedule low latency instructions as top as possible.
-  // Order of priority is:
-  // . Low latency instructions which do not depend on other low latency
-  //   instructions we haven't waited for
-  // . Other instructions which do not depend on low latency instructions
-  //   we haven't waited for
-  // . Low latencies
-  // . All other instructions
-  // Goal is to get: low latency instructions - independent instructions
-  //     - (eventually some more low latency instructions)
-  //     - instructions that depend on the first low latency instructions.
-  // If in the block there is a lot of constant loads, the SGPR usage
-  // could go quite high, thus above the arbitrary limit of 60 will encourage
-  // use the already loaded constants (in order to release some SGPRs) before
-  // loading more.
-  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
-                       Cand.HasLowLatencyNonWaitedParent,
-                       TryCand, Cand, SIScheduleCandReason::Depth))
-    return;
-
-  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
-                          TryCand, Cand, SIScheduleCandReason::Depth))
-    return;
-
-  if (TryCand.IsLowLatency &&
-      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
-                       TryCand, Cand, SIScheduleCandReason::Depth))
-    return;
-
-  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
-                       TryCand, Cand, RegUsage))
-    return;
-
-  // Fall through to original instruction order.
-  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
-    TryCand.Reason = NodeOrder;
-  }
-}
-
-SUnit* SIScheduleBlock::pickNode() {
-  SISchedCandidate TopCand;
-
-  for (SUnit* SU : TopReadySUs) {
-    SISchedCandidate TryCand;
-    std::vector<unsigned> pressure;
-    std::vector<unsigned> MaxPressure;
-    // Predict register usage after this instruction.
-    TryCand.SU = SU;
-    TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
-    TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
-    TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
-    TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
-    TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
-    TryCand.HasLowLatencyNonWaitedParent =
-      HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
-    tryCandidateTopDown(TopCand, TryCand);
-    if (TryCand.Reason != NoCand)
-      TopCand.setBest(TryCand);
-  }
-
-  return TopCand.SU;
-}
-
-
-// Schedule something valid.
-void SIScheduleBlock::fastSchedule() {
-  TopReadySUs.clear();
-  if (Scheduled)
-    undoSchedule();
-
-  for (SUnit* SU : SUnits) {
-    if (!SU->NumPredsLeft)
-      TopReadySUs.push_back(SU);
-  }
-
-  while (!TopReadySUs.empty()) {
-    SUnit *SU = TopReadySUs[0];
-    ScheduledSUnits.push_back(SU);
-    nodeScheduled(SU);
-  }
-
-  Scheduled = true;
-}
-
-// Returns if the register was set between first and last.
-static bool isDefBetween(unsigned Reg,
-                           SlotIndex First, SlotIndex Last,
-                           const MachineRegisterInfo *MRI,
-                           const LiveIntervals *LIS) {
-  for (MachineRegisterInfo::def_instr_iterator
-       UI = MRI->def_instr_begin(Reg),
-       UE = MRI->def_instr_end(); UI != UE; ++UI) {
-    const MachineInstr* MI = &*UI;
-    if (MI->isDebugValue())
-      continue;
-    SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
-    if (InstSlot >= First && InstSlot <= Last)
-      return true;
-  }
-  return false;
-}
-
-void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
-                                      MachineBasicBlock::iterator EndBlock) {
-  IntervalPressure Pressure, BotPressure;
-  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
-  LiveIntervals *LIS = DAG->getLIS();
-  MachineRegisterInfo *MRI = DAG->getMRI();
-  DAG->initRPTracker(TopRPTracker);
-  DAG->initRPTracker(BotRPTracker);
-  DAG->initRPTracker(RPTracker);
-
-  // Goes though all SU. RPTracker captures what had to be alive for the SUs
-  // to execute, and what is still alive at the end.
-  for (SUnit* SU : ScheduledSUnits) {
-    RPTracker.setPos(SU->getInstr());
-    RPTracker.advance();
-  }
-
-  // Close the RPTracker to finalize live ins/outs.
-  RPTracker.closeRegion();
-
-  // Initialize the live ins and live outs.
-  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
-  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
-
-  // Do not Track Physical Registers, because it messes up.
-  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
-    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
-      LiveInRegs.insert(RegMaskPair.RegUnit);
-  }
-  LiveOutRegs.clear();
-  // There is several possibilities to distinguish:
-  // 1) Reg is not input to any instruction in the block, but is output of one
-  // 2) 1) + read in the block and not needed after it
-  // 3) 1) + read in the block but needed in another block
-  // 4) Reg is input of an instruction but another block will read it too
-  // 5) Reg is input of an instruction and then rewritten in the block.
-  //    result is not read in the block (implies used in another block)
-  // 6) Reg is input of an instruction and then rewritten in the block.
-  //    result is read in the block and not needed in another block
-  // 7) Reg is input of an instruction and then rewritten in the block.
-  //    result is read in the block but also needed in another block
-  // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
-  // We want LiveOutRegs to contain only Regs whose content will be read after
-  // in another block, and whose content was written in the current block,
-  // that is we want it to get 1, 3, 5, 7
-  // Since we made the MIs of a block to be packed all together before
-  // scheduling, then the LiveIntervals were correct, and the RPTracker was
-  // able to correctly handle 5 vs 6, 2 vs 3.
-  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
-  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
-  // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
-  // The use of findDefBetween removes the case 4.
-  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
-    unsigned Reg = RegMaskPair.RegUnit;
-    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
-        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
-                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
-                     LIS)) {
-      LiveOutRegs.insert(Reg);
-    }
-  }
-
-  // Pressure = sum_alive_registers register size
-  // Internally llvm will represent some registers as big 128 bits registers
-  // for example, but they actually correspond to 4 actual 32 bits registers.
-  // Thus Pressure is not equal to num_alive_registers * constant.
-  LiveInPressure = TopPressure.MaxSetPressure;
-  LiveOutPressure = BotPressure.MaxSetPressure;
-
-  // Prepares TopRPTracker for top down scheduling.
-  TopRPTracker.closeTop();
-}
-
-void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
-                               MachineBasicBlock::iterator EndBlock) {
-  if (!Scheduled)
-    fastSchedule();
-
-  // PreScheduling phase to set LiveIn and LiveOut.
-  initRegPressure(BeginBlock, EndBlock);
-  undoSchedule();
-
-  // Schedule for real now.
-
-  TopReadySUs.clear();
-
-  for (SUnit* SU : SUnits) {
-    if (!SU->NumPredsLeft)
-      TopReadySUs.push_back(SU);
-  }
-
-  while (!TopReadySUs.empty()) {
-    SUnit *SU = pickNode();
-    ScheduledSUnits.push_back(SU);
-    TopRPTracker.setPos(SU->getInstr());
-    TopRPTracker.advance();
-    nodeScheduled(SU);
-  }
-
-  // TODO: compute InternalAdditionnalPressure.
-  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
-
-  // Check everything is right.
-#ifndef NDEBUG
-  assert(SUnits.size() == ScheduledSUnits.size() &&
-            TopReadySUs.empty());
-  for (SUnit* SU : SUnits) {
-    assert(SU->isScheduled &&
-              SU->NumPredsLeft == 0);
-  }
-#endif
-
-  Scheduled = true;
-}
-
-void SIScheduleBlock::undoSchedule() {
-  for (SUnit* SU : SUnits) {
-    SU->isScheduled = false;
-    for (SDep& Succ : SU->Succs) {
-      if (BC->isSUInBlock(Succ.getSUnit(), ID))
-        undoReleaseSucc(SU, &Succ);
-    }
-  }
-  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
-  ScheduledSUnits.clear();
-  Scheduled = false;
-}
-
-void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
-  SUnit *SuccSU = SuccEdge->getSUnit();
-
-  if (SuccEdge->isWeak()) {
-    ++SuccSU->WeakPredsLeft;
-    return;
-  }
-  ++SuccSU->NumPredsLeft;
-}
-
-void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
-  SUnit *SuccSU = SuccEdge->getSUnit();
-
-  if (SuccEdge->isWeak()) {
-    --SuccSU->WeakPredsLeft;
-    return;
-  }
-#ifndef NDEBUG
-  if (SuccSU->NumPredsLeft == 0) {
-    dbgs() << "*** Scheduling failed! ***\n";
-    DAG->dumpNode(*SuccSU);
-    dbgs() << " has been released too many times!\n";
-    llvm_unreachable(nullptr);
-  }
-#endif
-
-  --SuccSU->NumPredsLeft;
-}
-
-/// Release Successors of the SU that are in the block or not.
-void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
-  for (SDep& Succ : SU->Succs) {
-    SUnit *SuccSU = Succ.getSUnit();
-
-    if (SuccSU->NodeNum >= DAG->SUnits.size())
-        continue;
-
-    if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
-      continue;
-
-    releaseSucc(SU, &Succ);
-    if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
-      TopReadySUs.push_back(SuccSU);
-  }
-}
-
-void SIScheduleBlock::nodeScheduled(SUnit *SU) {
-  // Is in TopReadySUs
-  assert (!SU->NumPredsLeft);
-  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
-  if (I == TopReadySUs.end()) {
-    dbgs() << "Data Structure Bug in SI Scheduler\n";
-    llvm_unreachable(nullptr);
-  }
-  TopReadySUs.erase(I);
-
-  releaseSuccessors(SU, true);
-  // Scheduling this node will trigger a wait,
-  // thus propagate to other instructions that they do not need to wait either.
-  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
-    HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
-
-  if (DAG->IsLowLatencySU[SU->NodeNum]) {
-     for (SDep& Succ : SU->Succs) {
-      std::map<unsigned, unsigned>::iterator I =
-        NodeNum2Index.find(Succ.getSUnit()->NodeNum);
-      if (I != NodeNum2Index.end())
-        HasLowLatencyNonWaitedParent[I->second] = 1;
-    }
-  }
-  SU->isScheduled = true;
-}
-
-void SIScheduleBlock::finalizeUnits() {
-  // We remove links from outside blocks to enable scheduling inside the block.
-  for (SUnit* SU : SUnits) {
-    releaseSuccessors(SU, false);
-    if (DAG->IsHighLatencySU[SU->NodeNum])
-      HighLatencyBlock = true;
-  }
-  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
-}
-
-// we maintain ascending order of IDs
-void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
-  unsigned PredID = Pred->getID();
-
-  // Check if not already predecessor.
-  for (SIScheduleBlock* P : Preds) {
-    if (PredID == P->getID())
-      return;
-  }
-  Preds.push_back(Pred);
-
-  assert(none_of(Succs,
-                 [=](std::pair<SIScheduleBlock*,
-                     SIScheduleBlockLinkKind> S) {
-                   return PredID == S.first->getID();
-                    }) &&
-         "Loop in the Block Graph!");
-}
-
-void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
-                              SIScheduleBlockLinkKind Kind) {
-  unsigned SuccID = Succ->getID();
-
-  // Check if not already predecessor.
-  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
-    if (SuccID == S.first->getID()) {
-      if (S.second == SIScheduleBlockLinkKind::NoData &&
-          Kind == SIScheduleBlockLinkKind::Data)
-        S.second = Kind;
-      return;
-    }
-  }
-  if (Succ->isHighLatencyBlock())
-    ++NumHighLatencySuccessors;
-  Succs.push_back(std::make_pair(Succ, Kind));
-
-  assert(none_of(Preds,
-                 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
-         "Loop in the Block Graph!");
-}
-
-#ifndef NDEBUG
-void SIScheduleBlock::printDebug(bool full) {
-  dbgs() << "Block (" << ID << ")\n";
-  if (!full)
-    return;
-
-  dbgs() << "\nContains High Latency Instruction: "
-         << HighLatencyBlock << '\n';
-  dbgs() << "\nDepends On:\n";
-  for (SIScheduleBlock* P : Preds) {
-    P->printDebug(false);
-  }
-
-  dbgs() << "\nSuccessors:\n";
-  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
-    if (S.second == SIScheduleBlockLinkKind::Data)
-      dbgs() << "(Data Dep) ";
-    S.first->printDebug(false);
-  }
-
-  if (Scheduled) {
-    dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
-           << LiveInPressure[DAG->getVGPRSetID()] << '\n';
-    dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
-           << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
-    dbgs() << "LiveIns:\n";
-    for (unsigned Reg : LiveInRegs)
-      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
-
-    dbgs() << "\nLiveOuts:\n";
-    for (unsigned Reg : LiveOutRegs)
-      dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
-  }
-
-  dbgs() << "\nInstructions:\n";
-  if (!Scheduled) {
-    for (const SUnit* SU : SUnits)
-      DAG->dumpNode(*SU);
-  } else {
-    for (const SUnit* SU : SUnits)
-      DAG->dumpNode(*SU);
-  }
-
-  dbgs() << "///////////////////////\n";
-}
-#endif
-
-// SIScheduleBlockCreator //
-
-SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
-DAG(DAG) {
-}
-
-SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
-
-SIScheduleBlocks
-SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
-  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
-    Blocks.find(BlockVariant);
-  if (B == Blocks.end()) {
-    SIScheduleBlocks Res;
-    createBlocksForVariant(BlockVariant);
-    topologicalSort();
-    scheduleInsideBlocks();
-    fillStats();
-    Res.Blocks = CurrentBlocks;
-    Res.TopDownIndex2Block = TopDownIndex2Block;
-    Res.TopDownBlock2Index = TopDownBlock2Index;
-    Blocks[BlockVariant] = Res;
-    return Res;
-  } else {
-    return B->second;
-  }
-}
-
-bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
-  if (SU->NodeNum >= DAG->SUnits.size())
-    return false;
-  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
-}
-
-void SIScheduleBlockCreator::colorHighLatenciesAlone() {
-  unsigned DAGSize = DAG->SUnits.size();
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    if (DAG->IsHighLatencySU[SU->NodeNum]) {
-      CurrentColoring[SU->NodeNum] = NextReservedID++;
-    }
-  }
-}
-
-static bool
-hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
-  for (const auto &PredDep : SU.Preds) {
-    if (PredDep.getSUnit() == &FromSU &&
-        PredDep.getKind() == llvm::SDep::Data)
-      return true;
-  }
-  return false;
-}
-
-void SIScheduleBlockCreator::colorHighLatenciesGroups() {
-  unsigned DAGSize = DAG->SUnits.size();
-  unsigned NumHighLatencies = 0;
-  unsigned GroupSize;
-  int Color = NextReservedID;
-  unsigned Count = 0;
-  std::set<unsigned> FormingGroup;
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    if (DAG->IsHighLatencySU[SU->NodeNum])
-      ++NumHighLatencies;
-  }
-
-  if (NumHighLatencies == 0)
-    return;
-
-  if (NumHighLatencies <= 6)
-    GroupSize = 2;
-  else if (NumHighLatencies <= 12)
-    GroupSize = 3;
-  else
-    GroupSize = 4;
-
-  for (unsigned SUNum : DAG->TopDownIndex2SU) {
-    const SUnit &SU = DAG->SUnits[SUNum];
-    if (DAG->IsHighLatencySU[SU.NodeNum]) {
-      unsigned CompatibleGroup = true;
-      int ProposedColor = Color;
-      std::vector<int> AdditionalElements;
-
-      // We don't want to put in the same block
-      // two high latency instructions that depend
-      // on each other.
-      // One way would be to check canAddEdge
-      // in both directions, but that currently is not
-      // enough because there the high latency order is
-      // enforced (via links).
-      // Instead, look at the dependencies between the
-      // high latency instructions and deduce if it is
-      // a data dependency or not.
-      for (unsigned j : FormingGroup) {
-        bool HasSubGraph;
-        std::vector<int> SubGraph;
-        // By construction (topological order), if SU and
-        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
-        // in the parent graph of SU.
-#ifndef NDEBUG
-        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
-                                               HasSubGraph);
-        assert(!HasSubGraph);
-#endif
-        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
-                                               HasSubGraph);
-        if (!HasSubGraph)
-          continue; // No dependencies between each other
-        else if (SubGraph.size() > 5) {
-          // Too many elements would be required to be added to the block.
-          CompatibleGroup = false;
-          break;
-        }
-        else {
-          // Check the type of dependency
-          for (unsigned k : SubGraph) {
-            // If in the path to join the two instructions,
-            // there is another high latency instruction,
-            // or instructions colored for another block
-            // abort the merge.
-            if (DAG->IsHighLatencySU[k] ||
-                (CurrentColoring[k] != ProposedColor &&
-                 CurrentColoring[k] != 0)) {
-              CompatibleGroup = false;
-              break;
-            }
-            // If one of the SU in the subgraph depends on the result of SU j,
-            // there'll be a data dependency.
-            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
-              CompatibleGroup = false;
-              break;
-            }
-          }
-          if (!CompatibleGroup)
-            break;
-          // Same check for the SU
-          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
-            CompatibleGroup = false;
-            break;
-          }
-          // Add all the required instructions to the block
-          // These cannot live in another block (because they
-          // depend (order dependency) on one of the
-          // instruction in the block, and are required for the
-          // high latency instruction we add.
-          AdditionalElements.insert(AdditionalElements.end(),
-                                    SubGraph.begin(), SubGraph.end());
-        }
-      }
-      if (CompatibleGroup) {
-        FormingGroup.insert(SU.NodeNum);
-        for (unsigned j : AdditionalElements)
-          CurrentColoring[j] = ProposedColor;
-        CurrentColoring[SU.NodeNum] = ProposedColor;
-        ++Count;
-      }
-      // Found one incompatible instruction,
-      // or has filled a big enough group.
-      // -> start a new one.
-      if (!CompatibleGroup) {
-        FormingGroup.clear();
-        Color = ++NextReservedID;
-        ProposedColor = Color;
-        FormingGroup.insert(SU.NodeNum);
-        CurrentColoring[SU.NodeNum] = ProposedColor;
-        Count = 0;
-      } else if (Count == GroupSize) {
-        FormingGroup.clear();
-        Color = ++NextReservedID;
-        ProposedColor = Color;
-        Count = 0;
-      }
-    }
-  }
-}
-
-void SIScheduleBlockCreator::colorComputeReservedDependencies() {
-  unsigned DAGSize = DAG->SUnits.size();
-  std::map<std::set<unsigned>, unsigned> ColorCombinations;
-
-  CurrentTopDownReservedDependencyColoring.clear();
-  CurrentBottomUpReservedDependencyColoring.clear();
-
-  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
-  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
-
-  // Traverse TopDown, and give different colors to SUs depending
-  // on which combination of High Latencies they depend on.
-
-  for (unsigned SUNum : DAG->TopDownIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-
-    // Already given.
-    if (CurrentColoring[SU->NodeNum]) {
-      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
-        CurrentColoring[SU->NodeNum];
-      continue;
-    }
-
-   for (SDep& PredDep : SU->Preds) {
-      SUnit *Pred = PredDep.getSUnit();
-      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
-        continue;
-      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
-        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
-    }
-    // Color 0 by default.
-    if (SUColors.empty())
-      continue;
-    // Same color than parents.
-    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
-      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
-        *SUColors.begin();
-    else {
-      std::map<std::set<unsigned>, unsigned>::iterator Pos =
-        ColorCombinations.find(SUColors);
-      if (Pos != ColorCombinations.end()) {
-          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
-      } else {
-        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
-          NextNonReservedID;
-        ColorCombinations[SUColors] = NextNonReservedID++;
-      }
-    }
-  }
-
-  ColorCombinations.clear();
-
-  // Same as before, but BottomUp.
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-
-    // Already given.
-    if (CurrentColoring[SU->NodeNum]) {
-      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
-        CurrentColoring[SU->NodeNum];
-      continue;
-    }
-
-    for (SDep& SuccDep : SU->Succs) {
-      SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
-        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
-    }
-    // Keep color 0.
-    if (SUColors.empty())
-      continue;
-    // Same color than parents.
-    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
-      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
-        *SUColors.begin();
-    else {
-      std::map<std::set<unsigned>, unsigned>::iterator Pos =
-        ColorCombinations.find(SUColors);
-      if (Pos != ColorCombinations.end()) {
-        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
-      } else {
-        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
-          NextNonReservedID;
-        ColorCombinations[SUColors] = NextNonReservedID++;
-      }
-    }
-  }
-}
-
-void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
-  unsigned DAGSize = DAG->SUnits.size();
-  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
-
-  // Every combination of colors given by the top down
-  // and bottom up Reserved node dependency
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    std::pair<unsigned, unsigned> SUColors;
-
-    // High latency instructions: already given.
-    if (CurrentColoring[SU->NodeNum])
-      continue;
-
-    SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
-    SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
-
-    std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
-      ColorCombinations.find(SUColors);
-    if (Pos != ColorCombinations.end()) {
-      CurrentColoring[SU->NodeNum] = Pos->second;
-    } else {
-      CurrentColoring[SU->NodeNum] = NextNonReservedID;
-      ColorCombinations[SUColors] = NextNonReservedID++;
-    }
-  }
-}
-
-void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
-  unsigned DAGSize = DAG->SUnits.size();
-  std::vector<int> PendingColoring = CurrentColoring;
-
-  assert(DAGSize >= 1 &&
-         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
-         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
-  // If there is no reserved block at all, do nothing. We don't want
-  // everything in one block.
-  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
-                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
-      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
-                        CurrentTopDownReservedDependencyColoring.end()) == 0)
-    return;
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-    std::set<unsigned> SUColorsPending;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
-        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-      SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
-          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
-        SUColors.insert(CurrentColoring[Succ->NodeNum]);
-      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
-    }
-    // If there is only one child/parent block, and that block
-    // is not among the ones we are removing in this path, then
-    // merge the instruction to that block
-    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
-      PendingColoring[SU->NodeNum] = *SUColors.begin();
-    else // TODO: Attribute new colors depending on color
-         // combination of children.
-      PendingColoring[SU->NodeNum] = NextNonReservedID++;
-  }
-  CurrentColoring = PendingColoring;
-}
-
-
-void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
-  unsigned DAGSize = DAG->SUnits.size();
-  unsigned PreviousColor;
-  std::set<unsigned> SeenColors;
-
-  if (DAGSize <= 1)
-    return;
-
-  PreviousColor = CurrentColoring[0];
-
-  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    unsigned CurrentColor = CurrentColoring[i];
-    unsigned PreviousColorSave = PreviousColor;
-    assert(i == SU->NodeNum);
-
-    if (CurrentColor != PreviousColor)
-      SeenColors.insert(PreviousColor);
-    PreviousColor = CurrentColor;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    if (SeenColors.find(CurrentColor) == SeenColors.end())
-      continue;
-
-    if (PreviousColorSave != CurrentColor)
-      CurrentColoring[i] = NextNonReservedID++;
-    else
-      CurrentColoring[i] = CurrentColoring[i-1];
-  }
-}
-
-void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
-  unsigned DAGSize = DAG->SUnits.size();
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    // No predecessor: Vgpr constant loading.
-    // Low latency instructions usually have a predecessor (the address)
-    if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-      SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      SUColors.insert(CurrentColoring[Succ->NodeNum]);
-    }
-    if (SUColors.size() == 1)
-      CurrentColoring[SU->NodeNum] = *SUColors.begin();
-  }
-}
-
-void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
-  unsigned DAGSize = DAG->SUnits.size();
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-       SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      SUColors.insert(CurrentColoring[Succ->NodeNum]);
-    }
-    if (SUColors.size() == 1)
-      CurrentColoring[SU->NodeNum] = *SUColors.begin();
-  }
-}
-
-void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
-  unsigned DAGSize = DAG->SUnits.size();
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    std::set<unsigned> SUColors;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-       SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      SUColors.insert(CurrentColoring[Succ->NodeNum]);
-    }
-    if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
-      CurrentColoring[SU->NodeNum] = *SUColors.begin();
-  }
-}
-
-void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
-  unsigned DAGSize = DAG->SUnits.size();
-  std::map<unsigned, unsigned> ColorCount;
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    unsigned color = CurrentColoring[SU->NodeNum];
-     ++ColorCount[color];
-  }
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    unsigned color = CurrentColoring[SU->NodeNum];
-    std::set<unsigned> SUColors;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    if (ColorCount[color] > 1)
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-       SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      SUColors.insert(CurrentColoring[Succ->NodeNum]);
-    }
-    if (SUColors.size() == 1 && *SUColors.begin() != color) {
-      --ColorCount[color];
-      CurrentColoring[SU->NodeNum] = *SUColors.begin();
-      ++ColorCount[*SUColors.begin()];
-    }
-  }
-}
-
-void SIScheduleBlockCreator::cutHugeBlocks() {
-  // TODO
-}
-
-void SIScheduleBlockCreator::regroupNoUserInstructions() {
-  unsigned DAGSize = DAG->SUnits.size();
-  int GroupID = NextNonReservedID++;
-
-  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
-    SUnit *SU = &DAG->SUnits[SUNum];
-    bool hasSuccessor = false;
-
-    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
-      continue;
-
-    for (SDep& SuccDep : SU->Succs) {
-       SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      hasSuccessor = true;
-    }
-    if (!hasSuccessor)
-      CurrentColoring[SU->NodeNum] = GroupID;
-  }
-}
-
-void SIScheduleBlockCreator::colorExports() {
-  unsigned ExportColor = NextNonReservedID++;
-  SmallVector<unsigned, 8> ExpGroup;
-
-  // Put all exports together in a block.
-  // The block will naturally end up being scheduled last,
-  // thus putting exports at the end of the schedule, which
-  // is better for performance.
-  // However we must ensure, for safety, the exports can be put
-  // together in the same block without any other instruction.
-  // This could happen, for example, when scheduling after regalloc
-  // if reloading a spilled register from memory using the same
-  // register than used in a previous export.
-  // If that happens, do not regroup the exports.
-  for (unsigned SUNum : DAG->TopDownIndex2SU) {
-    const SUnit &SU = DAG->SUnits[SUNum];
-    if (SIInstrInfo::isEXP(*SU.getInstr())) {
-      // Check the EXP can be added to the group safely,
-      // ie without needing any other instruction.
-      // The EXP is allowed to depend on other EXP
-      // (they will be in the same group).
-      for (unsigned j : ExpGroup) {
-        bool HasSubGraph;
-        std::vector<int> SubGraph;
-        // By construction (topological order), if SU and
-        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
-        // in the parent graph of SU.
-#ifndef NDEBUG
-        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
-                                               HasSubGraph);
-        assert(!HasSubGraph);
-#endif
-        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
-                                               HasSubGraph);
-        if (!HasSubGraph)
-          continue; // No dependencies between each other
-
-        // SubGraph contains all the instructions required
-        // between EXP SUnits[j] and EXP SU.
-        for (unsigned k : SubGraph) {
-          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
-            // Other instructions than EXP would be required in the group.
-            // Abort the groupping.
-            return;
-        }
-      }
-
-      ExpGroup.push_back(SUNum);
-    }
-  }
-
-  // The group can be formed. Give the color.
-  for (unsigned j : ExpGroup)
-    CurrentColoring[j] = ExportColor;
-}
-
-void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
-  unsigned DAGSize = DAG->SUnits.size();
-  std::map<unsigned,unsigned> RealID;
-
-  CurrentBlocks.clear();
-  CurrentColoring.clear();
-  CurrentColoring.resize(DAGSize, 0);
-  Node2CurrentBlock.clear();
-
-  // Restore links previous scheduling variant has overridden.
-  DAG->restoreSULinksLeft();
-
-  NextReservedID = 1;
-  NextNonReservedID = DAGSize + 1;
-
-  LLVM_DEBUG(dbgs() << "Coloring the graph\n");
-
-  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
-    colorHighLatenciesGroups();
-  else
-    colorHighLatenciesAlone();
-  colorComputeReservedDependencies();
-  colorAccordingToReservedDependencies();
-  colorEndsAccordingToDependencies();
-  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
-    colorForceConsecutiveOrderInGroup();
-  regroupNoUserInstructions();
-  colorMergeConstantLoadsNextGroup();
-  colorMergeIfPossibleNextGroupOnlyForReserved();
-  colorExports();
-
-  // Put SUs of same color into same block
-  Node2CurrentBlock.resize(DAGSize, -1);
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    unsigned Color = CurrentColoring[SU->NodeNum];
-    if (RealID.find(Color) == RealID.end()) {
-      int ID = CurrentBlocks.size();
-      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
-      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
-      RealID[Color] = ID;
-    }
-    CurrentBlocks[RealID[Color]]->addUnit(SU);
-    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
-  }
-
-  // Build dependencies between blocks.
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    int SUID = Node2CurrentBlock[i];
-     for (SDep& SuccDep : SU->Succs) {
-       SUnit *Succ = SuccDep.getSUnit();
-      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
-        continue;
-      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
-        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
-                                     SuccDep.isCtrl() ? NoData : Data);
-    }
-    for (SDep& PredDep : SU->Preds) {
-      SUnit *Pred = PredDep.getSUnit();
-      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
-        continue;
-      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
-        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
-    }
-  }
-
-  // Free root and leafs of all blocks to enable scheduling inside them.
-  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    Block->finalizeUnits();
-  }
-  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
-             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
-               SIScheduleBlock *Block = CurrentBlocks[i];
-               Block->printDebug(true);
-             });
-}
-
-// Two functions taken from Codegen/MachineScheduler.cpp
-
-/// Non-const version.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I,
-            MachineBasicBlock::const_iterator End) {
-  for (; I != End; ++I) {
-    if (!I->isDebugInstr())
-      break;
-  }
-  return I;
-}
-
-void SIScheduleBlockCreator::topologicalSort() {
-  unsigned DAGSize = CurrentBlocks.size();
-  std::vector<int> WorkList;
-
-  LLVM_DEBUG(dbgs() << "Topological Sort\n");
-
-  WorkList.reserve(DAGSize);
-  TopDownIndex2Block.resize(DAGSize);
-  TopDownBlock2Index.resize(DAGSize);
-  BottomUpIndex2Block.resize(DAGSize);
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    unsigned Degree = Block->getSuccs().size();
-    TopDownBlock2Index[i] = Degree;
-    if (Degree == 0) {
-      WorkList.push_back(i);
-    }
-  }
-
-  int Id = DAGSize;
-  while (!WorkList.empty()) {
-    int i = WorkList.back();
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    WorkList.pop_back();
-    TopDownBlock2Index[i] = --Id;
-    TopDownIndex2Block[Id] = i;
-    for (SIScheduleBlock* Pred : Block->getPreds()) {
-      if (!--TopDownBlock2Index[Pred->getID()])
-        WorkList.push_back(Pred->getID());
-    }
-  }
-
-#ifndef NDEBUG
-  // Check correctness of the ordering.
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    for (SIScheduleBlock* Pred : Block->getPreds()) {
-      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
-      "Wrong Top Down topological sorting");
-    }
-  }
-#endif
-
-  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
-                                         TopDownIndex2Block.rend());
-}
-
-void SIScheduleBlockCreator::scheduleInsideBlocks() {
-  unsigned DAGSize = CurrentBlocks.size();
-
-  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
-
-  // We do schedule a valid scheduling such that a Block corresponds
-  // to a range of instructions.
-  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    Block->fastSchedule();
-  }
-
-  // Note: the following code, and the part restoring previous position
-  // is by far the most expensive operation of the Scheduler.
-
-  // Do not update CurrentTop.
-  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
-  std::vector<MachineBasicBlock::iterator> PosOld;
-  std::vector<MachineBasicBlock::iterator> PosNew;
-  PosOld.reserve(DAG->SUnits.size());
-  PosNew.reserve(DAG->SUnits.size());
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    int BlockIndice = TopDownIndex2Block[i];
-    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
-    std::vector<SUnit*> SUs = Block->getScheduledUnits();
-
-    for (SUnit* SU : SUs) {
-      MachineInstr *MI = SU->getInstr();
-      MachineBasicBlock::iterator Pos = MI;
-      PosOld.push_back(Pos);
-      if (&*CurrentTopFastSched == MI) {
-        PosNew.push_back(Pos);
-        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
-                                          DAG->getCurrentBottom());
-      } else {
-        // Update the instruction stream.
-        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
-
-        // Update LiveIntervals.
-        // Note: Moving all instructions and calling handleMove every time
-        // is the most cpu intensive operation of the scheduler.
-        // It would gain a lot if there was a way to recompute the
-        // LiveIntervals for the entire scheduling region.
-        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
-        PosNew.push_back(CurrentTopFastSched);
-      }
-    }
-  }
-
-  // Now we have Block of SUs == Block of MI.
-  // We do the final schedule for the instructions inside the block.
-  // The property that all the SUs of the Block are grouped together as MI
-  // is used for correct reg usage tracking.
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    std::vector<SUnit*> SUs = Block->getScheduledUnits();
-    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
-  }
-
-  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
-  // Restore old ordering (which prevents a LIS->handleMove bug).
-  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
-    MachineBasicBlock::iterator POld = PosOld[i-1];
-    MachineBasicBlock::iterator PNew = PosNew[i-1];
-    if (PNew != POld) {
-      // Update the instruction stream.
-      DAG->getBB()->splice(POld, DAG->getBB(), PNew);
-
-      // Update LiveIntervals.
-      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
-    }
-  }
-
-  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = CurrentBlocks[i];
-    Block->printDebug(true);
-  });
-}
-
-void SIScheduleBlockCreator::fillStats() {
-  unsigned DAGSize = CurrentBlocks.size();
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    int BlockIndice = TopDownIndex2Block[i];
-    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
-    if (Block->getPreds().empty())
-      Block->Depth = 0;
-    else {
-      unsigned Depth = 0;
-      for (SIScheduleBlock *Pred : Block->getPreds()) {
-        if (Depth < Pred->Depth + Pred->getCost())
-          Depth = Pred->Depth + Pred->getCost();
-      }
-      Block->Depth = Depth;
-    }
-  }
-
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    int BlockIndice = BottomUpIndex2Block[i];
-    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
-    if (Block->getSuccs().empty())
-      Block->Height = 0;
-    else {
-      unsigned Height = 0;
-      for (const auto &Succ : Block->getSuccs())
-        Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
-      Block->Height = Height;
-    }
-  }
-}
-
-// SIScheduleBlockScheduler //
-
-SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
-                                                   SISchedulerBlockSchedulerVariant Variant,
-                                                   SIScheduleBlocks  BlocksStruct) :
-  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
-  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
-  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
-
-  // Fill the usage of every output
-  // Warning: while by construction we always have a link between two blocks
-  // when one needs a result from the other, the number of users of an output
-  // is not the sum of child blocks having as input the same virtual register.
-  // Here is an example. A produces x and y. B eats x and produces x'.
-  // C eats x' and y. The register coalescer may have attributed the same
-  // virtual register to x and x'.
-  // To count accurately, we do a topological sort. In case the register is
-  // found for several parents, we increment the usage of the one with the
-  // highest topological index.
-  LiveOutRegsNumUsages.resize(Blocks.size());
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = Blocks[i];
-    for (unsigned Reg : Block->getInRegs()) {
-      bool Found = false;
-      int topoInd = -1;
-      for (SIScheduleBlock* Pred: Block->getPreds()) {
-        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
-        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
-
-        if (RegPos != PredOutRegs.end()) {
-          Found = true;
-          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
-            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
-          }
-        }
-      }
-
-      if (!Found)
-        continue;
-
-      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
-      ++LiveOutRegsNumUsages[PredID][Reg];
-    }
-  }
-
-  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
-  BlockNumPredsLeft.resize(Blocks.size());
-  BlockNumSuccsLeft.resize(Blocks.size());
-
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = Blocks[i];
-    BlockNumPredsLeft[i] = Block->getPreds().size();
-    BlockNumSuccsLeft[i] = Block->getSuccs().size();
-  }
-
-#ifndef NDEBUG
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = Blocks[i];
-    assert(Block->getID() == i);
-  }
-#endif
-
-  std::set<unsigned> InRegs = DAG->getInRegs();
-  addLiveRegs(InRegs);
-
-  // Increase LiveOutRegsNumUsages for blocks
-  // producing registers consumed in another
-  // scheduling region.
-  for (unsigned Reg : DAG->getOutRegs()) {
-    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-      // Do reverse traversal
-      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
-      SIScheduleBlock *Block = Blocks[ID];
-      const std::set<unsigned> &OutRegs = Block->getOutRegs();
-
-      if (OutRegs.find(Reg) == OutRegs.end())
-        continue;
-
-      ++LiveOutRegsNumUsages[ID][Reg];
-      break;
-    }
-  }
-
-  // Fill LiveRegsConsumers for regs that were already
-  // defined before scheduling.
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = Blocks[i];
-    for (unsigned Reg : Block->getInRegs()) {
-      bool Found = false;
-      for (SIScheduleBlock* Pred: Block->getPreds()) {
-        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
-        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
-
-        if (RegPos != PredOutRegs.end()) {
-          Found = true;
-          break;
-        }
-      }
-
-      if (!Found)
-        ++LiveRegsConsumers[Reg];
-    }
-  }
-
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    SIScheduleBlock *Block = Blocks[i];
-    if (BlockNumPredsLeft[i] == 0) {
-      ReadyBlocks.push_back(Block);
-    }
-  }
-
-  while (SIScheduleBlock *Block = pickBlock()) {
-    BlocksScheduled.push_back(Block);
-    blockScheduled(Block);
-  }
-
-  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
-                                            : BlocksScheduled) {
-    dbgs() << ' ' << Block->getID();
-  } dbgs() << '\n';);
-}
-
-bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
-                                                   SIBlockSchedCandidate &TryCand) {
-  if (!Cand.isValid()) {
-    TryCand.Reason = NodeOrder;
-    return true;
-  }
-
-  // Try to hide high latencies.
-  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
-                 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
-    return true;
-  // Schedule high latencies early so you can hide them better.
-  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
-                          TryCand, Cand, Latency))
-    return true;
-  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
-                                                   TryCand, Cand, Depth))
-    return true;
-  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
-                          Cand.NumHighLatencySuccessors,
-                          TryCand, Cand, Successor))
-    return true;
-  return false;
-}
-
-bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
-                                                    SIBlockSchedCandidate &TryCand) {
-  if (!Cand.isValid()) {
-    TryCand.Reason = NodeOrder;
-    return true;
-  }
-
-  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
-                       TryCand, Cand, RegUsage))
-    return true;
-  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
-                          Cand.NumSuccessors > 0,
-                          TryCand, Cand, Successor))
-    return true;
-  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
-    return true;
-  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
-                       TryCand, Cand, RegUsage))
-    return true;
-  return false;
-}
-
-SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
-  SIBlockSchedCandidate Cand;
-  std::vector<SIScheduleBlock*>::iterator Best;
-  SIScheduleBlock *Block;
-  if (ReadyBlocks.empty())
-    return nullptr;
-
-  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
-                        VregCurrentUsage, SregCurrentUsage);
-  if (VregCurrentUsage > maxVregUsage)
-    maxVregUsage = VregCurrentUsage;
-  if (SregCurrentUsage > maxSregUsage)
-    maxSregUsage = SregCurrentUsage;
-  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
-             for (SIScheduleBlock *Block
-                  : ReadyBlocks) dbgs()
-             << Block->getID() << ' ';
-             dbgs() << "\nCurrent Live:\n";
-             for (unsigned Reg
-                  : LiveRegs) dbgs()
-             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
-             dbgs() << '\n';
-             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
-             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
-
-  Cand.Block = nullptr;
-  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
-       E = ReadyBlocks.end(); I != E; ++I) {
-    SIBlockSchedCandidate TryCand;
-    TryCand.Block = *I;
-    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
-    TryCand.VGPRUsageDiff =
-      checkRegUsageImpact(TryCand.Block->getInRegs(),
-                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
-    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
-    TryCand.NumHighLatencySuccessors =
-      TryCand.Block->getNumHighLatencySuccessors();
-    TryCand.LastPosHighLatParentScheduled =
-      (unsigned int) std::max<int> (0,
-         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
-           LastPosWaitedHighLatency);
-    TryCand.Height = TryCand.Block->Height;
-    // Try not to increase VGPR usage too much, else we may spill.
-    if (VregCurrentUsage > 120 ||
-        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
-      if (!tryCandidateRegUsage(Cand, TryCand) &&
-          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
-        tryCandidateLatency(Cand, TryCand);
-    } else {
-      if (!tryCandidateLatency(Cand, TryCand))
-        tryCandidateRegUsage(Cand, TryCand);
-    }
-    if (TryCand.Reason != NoCand) {
-      Cand.setBest(TryCand);
-      Best = I;
-      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
-                        << getReasonStr(Cand.Reason) << '\n');
-    }
-  }
-
-  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
-             dbgs() << "Is a block with high latency instruction: "
-                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
-             dbgs() << "Position of last high latency dependency: "
-                    << Cand.LastPosHighLatParentScheduled << '\n';
-             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
-             dbgs() << '\n';);
-
-  Block = Cand.Block;
-  ReadyBlocks.erase(Best);
-  return Block;
-}
-
-// Tracking of currently alive registers to determine VGPR Usage.
-
-void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
-  for (unsigned Reg : Regs) {
-    // For now only track virtual registers.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-    // If not already in the live set, then add it.
-    (void) LiveRegs.insert(Reg);
-  }
-}
-
-void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
-                                       std::set<unsigned> &Regs) {
-  for (unsigned Reg : Regs) {
-    // For now only track virtual registers.
-    std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
-    assert (Pos != LiveRegs.end() && // Reg must be live.
-               LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
-               LiveRegsConsumers[Reg] >= 1);
-    --LiveRegsConsumers[Reg];
-    if (LiveRegsConsumers[Reg] == 0)
-      LiveRegs.erase(Pos);
-  }
-}
-
-void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
-  for (const auto &Block : Parent->getSuccs()) {
-    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
-      ReadyBlocks.push_back(Block.first);
-
-    if (Parent->isHighLatencyBlock() &&
-        Block.second == SIScheduleBlockLinkKind::Data)
-      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
-  }
-}
-
-void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
-  decreaseLiveRegs(Block, Block->getInRegs());
-  addLiveRegs(Block->getOutRegs());
-  releaseBlockSuccs(Block);
-  for (std::map<unsigned, unsigned>::iterator RegI =
-       LiveOutRegsNumUsages[Block->getID()].begin(),
-       E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
-    std::pair<unsigned, unsigned> RegP = *RegI;
-    // We produce this register, thus it must not be previously alive.
-    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
-           LiveRegsConsumers[RegP.first] == 0);
-    LiveRegsConsumers[RegP.first] += RegP.second;
-  }
-  if (LastPosHighLatencyParentScheduled[Block->getID()] >
-        (unsigned)LastPosWaitedHighLatency)
-    LastPosWaitedHighLatency =
-      LastPosHighLatencyParentScheduled[Block->getID()];
-  ++NumBlockScheduled;
-}
-
-std::vector<int>
-SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
-                                     std::set<unsigned> &OutRegs) {
-  std::vector<int> DiffSetPressure;
-  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
-
-  for (unsigned Reg : InRegs) {
-    // For now only track virtual registers.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-    if (LiveRegsConsumers[Reg] > 1)
-      continue;
-    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
-    for (; PSetI.isValid(); ++PSetI) {
-      DiffSetPressure[*PSetI] -= PSetI.getWeight();
-    }
-  }
-
-  for (unsigned Reg : OutRegs) {
-    // For now only track virtual registers.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-    PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
-    for (; PSetI.isValid(); ++PSetI) {
-      DiffSetPressure[*PSetI] += PSetI.getWeight();
-    }
-  }
-
-  return DiffSetPressure;
-}
-
-// SIScheduler //
-
-struct SIScheduleBlockResult
-SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
-                             SISchedulerBlockSchedulerVariant ScheduleVariant) {
-  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
-  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
-  std::vector<SIScheduleBlock*> ScheduledBlocks;
-  struct SIScheduleBlockResult Res;
-
-  ScheduledBlocks = Scheduler.getBlocks();
-
-  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
-    SIScheduleBlock *Block = ScheduledBlocks[b];
-    std::vector<SUnit*> SUs = Block->getScheduledUnits();
-
-    for (SUnit* SU : SUs)
-      Res.SUs.push_back(SU->NodeNum);
-  }
-
-  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
-  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
-  return Res;
-}
-
-// SIScheduleDAGMI //
-
-SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
-  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
-  SITII = static_cast<const SIInstrInfo*>(TII);
-  SITRI = static_cast<const SIRegisterInfo*>(TRI);
-
-  VGPRSetID = SITRI->getVGPRPressureSet();
-  SGPRSetID = SITRI->getSGPRPressureSet();
-}
-
-SIScheduleDAGMI::~SIScheduleDAGMI() = default;
-
-// Code adapted from scheduleDAG.cpp
-// Does a topological sort over the SUs.
-// Both TopDown and BottomUp
-void SIScheduleDAGMI::topologicalSort() {
-  Topo.InitDAGTopologicalSorting();
-
-  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
-  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
-}
-
-// Move low latencies further from their user without
-// increasing SGPR usage (in general)
-// This is to be replaced by a better pass that would
-// take into account SGPR usage (based on VGPR Usage
-// and the corresponding wavefront count), that would
-// try to merge groups of loads if it make sense, etc
-void SIScheduleDAGMI::moveLowLatencies() {
-   unsigned DAGSize = SUnits.size();
-   int LastLowLatencyUser = -1;
-   int LastLowLatencyPos = -1;
-
-   for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
-    SUnit *SU = &SUnits[ScheduledSUnits[i]];
-    bool IsLowLatencyUser = false;
-    unsigned MinPos = 0;
-
-    for (SDep& PredDep : SU->Preds) {
-      SUnit *Pred = PredDep.getSUnit();
-      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
-        IsLowLatencyUser = true;
-      }
-      if (Pred->NodeNum >= DAGSize)
-        continue;
-      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
-      if (PredPos >= MinPos)
-        MinPos = PredPos + 1;
-    }
-
-    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
-      unsigned BestPos = LastLowLatencyUser + 1;
-      if ((int)BestPos <= LastLowLatencyPos)
-        BestPos = LastLowLatencyPos + 1;
-      if (BestPos < MinPos)
-        BestPos = MinPos;
-      if (BestPos < i) {
-        for (unsigned u = i; u > BestPos; --u) {
-          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
-          ScheduledSUnits[u] = ScheduledSUnits[u-1];
-        }
-        ScheduledSUnits[BestPos] = SU->NodeNum;
-        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
-      }
-      LastLowLatencyPos = BestPos;
-      if (IsLowLatencyUser)
-        LastLowLatencyUser = BestPos;
-    } else if (IsLowLatencyUser) {
-      LastLowLatencyUser = i;
-    // Moves COPY instructions on which depends
-    // the low latency instructions too.
-    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
-      bool CopyForLowLat = false;
-      for (SDep& SuccDep : SU->Succs) {
-        SUnit *Succ = SuccDep.getSUnit();
-        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
-          CopyForLowLat = true;
-        }
-      }
-      if (!CopyForLowLat)
-        continue;
-      if (MinPos < i) {
-        for (unsigned u = i; u > MinPos; --u) {
-          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
-          ScheduledSUnits[u] = ScheduledSUnits[u-1];
-        }
-        ScheduledSUnits[MinPos] = SU->NodeNum;
-        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
-      }
-    }
-  }
-}
-
-void SIScheduleDAGMI::restoreSULinksLeft() {
-  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
-    SUnits[i].isScheduled = false;
-    SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
-    SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
-    SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
-    SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
-  }
-}
-
-// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
-template<typename _Iterator> void
-SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
-                                  unsigned &VgprUsage, unsigned &SgprUsage) {
-  VgprUsage = 0;
-  SgprUsage = 0;
-  for (_Iterator RegI = First; RegI != End; ++RegI) {
-    unsigned Reg = *RegI;
-    // For now only track virtual registers
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      continue;
-    PSetIterator PSetI = MRI.getPressureSets(Reg);
-    for (; PSetI.isValid(); ++PSetI) {
-      if (*PSetI == VGPRSetID)
-        VgprUsage += PSetI.getWeight();
-      else if (*PSetI == SGPRSetID)
-        SgprUsage += PSetI.getWeight();
-    }
-  }
-}
-
-void SIScheduleDAGMI::schedule()
-{
-  SmallVector<SUnit*, 8> TopRoots, BotRoots;
-  SIScheduleBlockResult Best, Temp;
-  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
-
-  buildDAGWithRegPressure();
-  LLVM_DEBUG(dump());
-
-  topologicalSort();
-  findRootsAndBiasEdges(TopRoots, BotRoots);
-  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
-  // functions, but to make them happy we must initialize
-  // the default Scheduler implementation (even if we do not
-  // run it)
-  SchedImpl->initialize(this);
-  initQueues(TopRoots, BotRoots);
-
-  // Fill some stats to help scheduling.
-
-  SUnitsLinksBackup = SUnits;
-  IsLowLatencySU.clear();
-  LowLatencyOffset.clear();
-  IsHighLatencySU.clear();
-
-  IsLowLatencySU.resize(SUnits.size(), 0);
-  LowLatencyOffset.resize(SUnits.size(), 0);
-  IsHighLatencySU.resize(SUnits.size(), 0);
-
-  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
-    SUnit *SU = &SUnits[i];
-    MachineOperand *BaseLatOp;
-    int64_t OffLatReg;
-    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
-      IsLowLatencySU[i] = 1;
-      if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
-                                         TRI))
-        LowLatencyOffset[i] = OffLatReg;
-    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
-      IsHighLatencySU[i] = 1;
-  }
-
-  SIScheduler Scheduler(this);
-  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
-                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
-
-  // if VGPR usage is extremely high, try other good performing variants
-  // which could lead to lower VGPR usage
-  if (Best.MaxVGPRUsage > 180) {
-    static const std::pair<SISchedulerBlockCreatorVariant,
-                           SISchedulerBlockSchedulerVariant>
-        Variants[] = {
-      { LatenciesAlone, BlockRegUsageLatency },
-//      { LatenciesAlone, BlockRegUsage },
-      { LatenciesGrouped, BlockLatencyRegUsage },
-//      { LatenciesGrouped, BlockRegUsageLatency },
-//      { LatenciesGrouped, BlockRegUsage },
-      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
-//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
-//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
-    };
-    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
-      Temp = Scheduler.scheduleVariant(v.first, v.second);
-      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
-        Best = Temp;
-    }
-  }
-  // if VGPR usage is still extremely high, we may spill. Try other variants
-  // which are less performing, but that could lead to lower VGPR usage.
-  if (Best.MaxVGPRUsage > 200) {
-    static const std::pair<SISchedulerBlockCreatorVariant,
-                           SISchedulerBlockSchedulerVariant>
-        Variants[] = {
-//      { LatenciesAlone, BlockRegUsageLatency },
-      { LatenciesAlone, BlockRegUsage },
-//      { LatenciesGrouped, BlockLatencyRegUsage },
-      { LatenciesGrouped, BlockRegUsageLatency },
-      { LatenciesGrouped, BlockRegUsage },
-//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
-      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
-      { LatenciesAlonePlusConsecutive, BlockRegUsage }
-    };
-    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
-      Temp = Scheduler.scheduleVariant(v.first, v.second);
-      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
-        Best = Temp;
-    }
-  }
-
-  ScheduledSUnits = Best.SUs;
-  ScheduledSUnitsInv.resize(SUnits.size());
-
-  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
-    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
-  }
-
-  moveLowLatencies();
-
-  // Tell the outside world about the result of the scheduling.
-
-  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
-  TopRPTracker.setPos(CurrentTop);
-
-  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
-       E = ScheduledSUnits.end(); I != E; ++I) {
-    SUnit *SU = &SUnits[*I];
-
-    scheduleMI(SU, true);
-
-    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
-                      << *SU->getInstr());
-  }
-
-  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
-
-  placeDebugValues();
-
-  LLVM_DEBUG({
-    dbgs() << "*** Final schedule for "
-           << printMBBReference(*begin()->getParent()) << " ***\n";
-    dumpSchedule();
-    dbgs() << '\n';
-  });
-}
author	patrick <patrick@openbsd.org>	2020-08-03 15:06:44 +0000
committer	patrick <patrick@openbsd.org>	2020-08-03 15:06:44 +0000
commit	b64793999546ed8adebaeebd9d8345d18db8927d (patch)
tree	4357c27b561d73b0e089727c6ed659f2ceff5f47 /gnu/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
parent	Add support for UTF-8 DISPLAY-HINTs with octet length. For now only (diff)
download	wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.tar.xz wireguard-openbsd-b64793999546ed8adebaeebd9d8345d18db8927d.zip