diff options
Diffstat (limited to 'gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp')
| -rw-r--r-- | gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 309 |
1 files changed, 240 insertions, 69 deletions
diff --git a/gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index a613a220e29..53aefe82973 100644 --- a/gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/gnu/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -9,7 +9,7 @@ // /// \file /// \brief This pass adds instructions to enable whole quad mode for pixel -/// shaders. +/// shaders, and whole wavefront mode for all programs. /// /// Whole quad mode is required for derivative computations, but it interferes /// with shader side effects (stores and atomics). This pass is run on the @@ -29,6 +29,13 @@ /// ... /// S_MOV_B64 EXEC, Tmp /// +/// We also compute when a sequence of instructions requires Whole Wavefront +/// Mode (WWM) and insert instructions to save and restore it: +/// +/// S_OR_SAVEEXEC_B64 Tmp, -1 +/// ... +/// S_MOV_B64 EXEC, Tmp +/// /// In order to avoid excessive switching during sequences of Exact /// instructions, the pass first analyzes which instructions must be run in WQM /// (aka which instructions produce values that lead to derivative @@ -54,10 +61,11 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -66,13 +74,13 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <vector> @@ -84,7 +92,8 @@ namespace { enum { StateWQM = 0x1, - StateExact = 0x2, + StateWWM = 0x2, + StateExact = 0x4, }; struct PrintState { @@ -94,20 +103,28 @@ public: explicit PrintState(int State) : State(State) {} }; +#ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { if (PS.State & StateWQM) OS << "WQM"; - if (PS.State & StateExact) { + if (PS.State & StateWWM) { if (PS.State & StateWQM) OS << '|'; + OS << "WWM"; + } + if (PS.State & StateExact) { + if (PS.State & (StateWQM | StateWWM)) + OS << '|'; OS << "Exact"; } return OS; } +#endif struct InstrInfo { char Needs = 0; + char Disabled = 0; char OutNeeds = 0; }; @@ -128,6 +145,7 @@ struct WorkItem { class SIWholeQuadMode : public MachineFunctionPass { private: + CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; @@ -136,12 +154,14 @@ private: DenseMap<const MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; SmallVector<MachineInstr *, 1> LiveMaskQueries; + SmallVector<MachineInstr *, 4> LowerToCopyInstrs; void printInfo(); void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); - void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist); + void markInstructionUses(const MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); @@ -159,9 +179,14 @@ private: unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM); + void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveOrig); + void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedOrig); void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); void lowerLiveMaskQueries(unsigned LiveMaskReg); + void lowerCopyInstrs(); public: static char ID; @@ -196,9 +221,11 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } -void SIWholeQuadMode::printInfo() { +#ifndef NDEBUG +LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { for (const auto &BII : Blocks) { - dbgs() << "\nBB#" << BII.first->getNumber() << ":\n" + dbgs() << "\n" + << printMBBReference(*BII.first) << ":\n" << " InNeeds = " << PrintState(BII.second.InNeeds) << ", Needs = " << PrintState(BII.second.Needs) << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n"; @@ -213,27 +240,32 @@ void SIWholeQuadMode::printInfo() { } } } +#endif void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist) { InstrInfo &II = Instructions[&MI]; - assert(Flag == StateWQM || Flag == StateExact); + assert(!(Flag & StateExact) && Flag != 0); - // Ignore if the instruction is already marked. The typical case is that we - // mark an instruction WQM multiple times, but for atomics it can happen that - // Flag is StateWQM, but Needs is already set to StateExact. In this case, - // letting the atomic run in StateExact is correct as per the relevant specs. - if (II.Needs) + // Remove any disabled states from the flag. The user that required it gets + // an undefined value in the helper lanes. For example, this can happen if + // the result of an atomic is used by instruction that requires WQM, where + // ignoring the request for WQM is correct as per the relevant specs. + Flag &= ~II.Disabled; + + // Ignore if the flag is already encompassed by the existing needs, or we + // just disabled everything. + if ((II.Needs & Flag) == Flag) return; - II.Needs = Flag; + II.Needs |= Flag; Worklist.push_back(&MI); } -/// Mark all instructions defining the uses in \p MI as WQM. -void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, - std::vector<WorkItem> &Worklist) { +/// Mark all instructions defining the uses in \p MI with \p Flag. +void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist) { for (const MachineOperand &Use : MI.uses()) { if (!Use.isReg() || !Use.isUse()) continue; @@ -258,7 +290,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, if (Value->isPHIDef()) continue; - markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, Worklist); } @@ -266,7 +298,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, } for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, StateWQM, Worklist); + markInstruction(DefMI, Flag, Worklist); } } @@ -275,27 +307,72 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist) { char GlobalFlags = 0; - bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); - - for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; + bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); + SmallVector<MachineInstr *, 4> SetInactiveInstrs; + + // We need to visit the basic blocks in reverse post-order so that we visit + // defs before uses, in particular so that we don't accidentally mark an + // instruction as needing e.g. WQM before visiting it and realizing it needs + // WQM disabled. + ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + BlockInfo &BBI = Blocks[&MBB]; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; + InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isDS(Opcode)) { + if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) { Flags = StateWQM; } else if (TII->isWQM(Opcode)) { // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. - markUsesWQM(MI, Worklist); + markInstructionUses(MI, StateWQM, Worklist); GlobalFlags |= StateWQM; continue; + } else if (Opcode == AMDGPU::WQM) { + // The WQM intrinsic requires its output to have all the helper lanes + // correct, so we need it to be in WQM. + Flags = StateWQM; + LowerToCopyInstrs.push_back(&MI); + } else if (Opcode == AMDGPU::WWM) { + // The WWM intrinsic doesn't make the same guarantee, and plus it needs + // to be executed in WQM or Exact so that its copy doesn't clobber + // inactive lanes. + markInstructionUses(MI, StateWWM, Worklist); + GlobalFlags |= StateWWM; + LowerToCopyInstrs.push_back(&MI); + continue; + } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || + Opcode == AMDGPU::V_SET_INACTIVE_B64) { + III.Disabled = StateWWM; + MachineOperand &Inactive = MI.getOperand(2); + if (Inactive.isReg()) { + if (Inactive.isUndef()) { + LowerToCopyInstrs.push_back(&MI); + } else { + unsigned Reg = Inactive.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + for (MachineInstr &DefMI : MRI->def_instructions(Reg)) + markInstruction(DefMI, StateWWM, Worklist); + } + } + } + SetInactiveInstrs.push_back(&MI); + continue; } else if (TII->isDisableWQM(MI)) { - Flags = StateExact; + BBI.Needs |= StateExact; + if (!(BBI.InNeeds & StateExact)) { + BBI.InNeeds |= StateExact; + Worklist.push_back(&MBB); + } + GlobalFlags |= StateExact; + III.Disabled = StateWQM | StateWWM; + continue; } else { if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); @@ -326,6 +403,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } } + // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is + // ever used anywhere in the function. This implements the corresponding + // semantics of @llvm.amdgcn.set.inactive. + if (GlobalFlags & StateWQM) { + for (MachineInstr *MI : SetInactiveInstrs) + markInstruction(*MI, StateWQM, Worklist); + } + return GlobalFlags; } @@ -337,22 +422,24 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, // Control flow-type instructions and stores to temporary memory that are // followed by WQM computations must themselves be in WQM. - if ((II.OutNeeds & StateWQM) && !II.Needs && + if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } // Propagate to block level - BI.Needs |= II.Needs; - if ((BI.InNeeds | II.Needs) != BI.InNeeds) { - BI.InNeeds |= II.Needs; - Worklist.push_back(MBB); + if (II.Needs & StateWQM) { + BI.Needs |= StateWQM; + if (!(BI.InNeeds & StateWQM)) { + BI.InNeeds |= StateWQM; + Worklist.push_back(MBB); + } } // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = II.Needs | II.OutNeeds; + char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -363,10 +450,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, } // Propagate WQM flag to instruction inputs - assert(II.Needs != (StateWQM | StateExact)); + assert(!(II.Needs & StateExact)); - if (II.Needs == StateWQM) - markUsesWQM(MI, Worklist); + if (II.Needs != 0) + markInstructionUses(MI, II.Needs, Worklist); } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -558,6 +645,29 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, LIS->InsertMachineInstrInMaps(*MI); } +void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveOrig) { + MachineInstr *MI; + + assert(SaveOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), + SaveOrig) + .addImm(-1); + LIS->InsertMachineInstrInMaps(*MI); +} + +void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedOrig) { + MachineInstr *MI; + + assert(SavedOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC) + .addReg(SavedOrig); + LIS->InsertMachineInstrInMaps(*MI); +} + void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry) { auto BII = Blocks.find(&MBB); @@ -566,45 +676,66 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, const BlockInfo &BI = BII->second; - if (!(BI.InNeeds & StateWQM)) - return; - // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) return; - DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n"); + DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); unsigned SavedWQMReg = 0; + unsigned SavedNonWWMReg = 0; bool WQMFromExec = isEntry; - char State = isEntry ? StateExact : StateWQM; + char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; + char NonWWMState = 0; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (isEntry) ++II; // Skip the instruction that saves LiveMask - MachineBasicBlock::iterator First = IE; + // This stores the first instruction where it's safe to switch from WQM to + // Exact or vice versa. + MachineBasicBlock::iterator FirstWQM = IE; + + // This stores the first instruction where it's safe to switch from WWM to + // Exact/WQM or to switch to WWM. It must always be the same as, or after, + // FirstWQM since if it's safe to switch to/from WWM, it must be safe to + // switch to/from WQM as well. + MachineBasicBlock::iterator FirstWWM = IE; for (;;) { MachineBasicBlock::iterator Next = II; - char Needs = 0; + char Needs = StateExact | StateWQM; // WWM is disabled by default char OutNeeds = 0; - if (First == IE) - First = II; + if (FirstWQM == IE) + FirstWQM = II; + + if (FirstWWM == IE) + FirstWWM = II; + // First, figure out the allowed states (Needs) based on the propagated + // flags. if (II != IE) { MachineInstr &MI = *II; if (requiresCorrectState(MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { - Needs = III->second.Needs; + if (III->second.Needs & StateWWM) + Needs = StateWWM; + else if (III->second.Needs & StateWQM) + Needs = StateWQM; + else + Needs &= ~III->second.Disabled; OutNeeds = III->second.OutNeeds; } + } else { + // If the instruction doesn't actually need a correct EXEC, then we can + // safely leave WWM enabled. + Needs = StateExact | StateWQM | StateWWM; } - if (MI.isTerminator() && !Needs && OutNeeds == StateExact) + if (MI.isTerminator() && OutNeeds == StateExact) Needs = StateExact; if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact) @@ -617,20 +748,45 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, Needs = StateWQM; else if (BI.OutNeeds == StateExact) Needs = StateExact; + else + Needs = StateWQM | StateExact; } - if (Needs) { - if (Needs != State) { - MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + // Now, transition if necessary. + if (!(Needs & State)) { + MachineBasicBlock::iterator First; + if (State == StateWWM || Needs == StateWWM) { + // We must switch to or from WWM + First = FirstWWM; + } else { + // We only need to switch to/from WQM, so we can use FirstWQM + First = FirstWQM; + } - if (Needs == StateExact) { + MachineBasicBlock::iterator Before = + prepareInsertion(MBB, First, II, Needs == StateWQM, + Needs == StateExact || WQMFromExec); + + if (State == StateWWM) { + assert(SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg); + State = NonWWMState; + } + + if (Needs == StateWWM) { + NonWWMState = State; + SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + toWWM(MBB, Before, SavedNonWWMReg); + State = StateWWM; + } else { + if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); toExact(MBB, Before, SavedWQMReg, LiveMaskReg); - } else { + State = StateExact; + } else if (State == StateExact && (Needs & StateWQM) && + !(Needs & StateExact)) { assert(WQMFromExec == (SavedWQMReg == 0)); toWQM(MBB, Before, SavedWQMReg); @@ -639,12 +795,19 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, LIS->createAndComputeVirtRegInterval(SavedWQMReg); SavedWQMReg = 0; } + State = StateWQM; + } else { + // We can get here if we transitioned from WWM to a non-WWM state that + // already matches our needs, but we shouldn't need to do anything. + assert(Needs & State); } - - State = Needs; } + } - First = IE; + if (Needs != (StateExact | StateWQM | StateWWM)) { + if (Needs != (StateExact | StateWQM)) + FirstWQM = IE; + FirstWWM = IE; } if (II == IE) @@ -666,13 +829,20 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { } } -bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) - return false; +void SIWholeQuadMode::lowerCopyInstrs() { + for (MachineInstr *MI : LowerToCopyInstrs) { + for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) + MI->RemoveOperand(i); + MI->setDesc(TII->get(AMDGPU::COPY)); + } +} +bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); + LowerToCopyInstrs.clear(); + CallingConv = MF.getFunction().getCallingConv(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -682,14 +852,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LIS = &getAnalysis<LiveIntervals>(); char GlobalFlags = analyzeFunction(MF); + unsigned LiveMaskReg = 0; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(AMDGPU::EXEC); - return !LiveMaskQueries.empty(); - } - - // Store a copy of the original live mask when required - unsigned LiveMaskReg = 0; - { + if (!(GlobalFlags & StateWWM)) + return !LiveMaskQueries.empty(); + } else { + // Store a copy of the original live mask when required MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); @@ -701,13 +870,15 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LIS->InsertMachineInstrInMaps(*MI); } + lowerLiveMaskQueries(LiveMaskReg); + if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC); - lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); // EntryMI may become invalid here return true; } @@ -715,7 +886,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { DEBUG(printInfo()); - lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); // Handle the general case for (auto BII : Blocks) |
