/// numberCtrlPredInSU - Count the control (non-data) predecessor edges of SU.
static unsigned numberCtrlPredInSU(SUnit *SU) {
  unsigned NumberDeps = 0;
  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I)
    if (I->isCtrl())
      NumberDeps++;
  return NumberDeps;
}
/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
/// create a cycle.
bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
  if (IsReachable(TargetSU, SU))
    return true;
  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I)
    if (I->isAssignedRegDep() &&
        IsReachable(TargetSU, I->getSUnit()))
      return true;
  return false;
}
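// Illustrative use (not part of this file, names follow the surrounding code):
// a scheduler that wants to add an edge from SU to TargetSU (i.e. make SU a
// predecessor of TargetSU) would typically guard the addition like this:
//
//   if (!Topo.WillCreateCycle(SU, TargetSU)) {
//     TargetSU->addPred(SDep(SU, SDep::Artificial));
//     Topo.AddPred(TargetSU, SU);  // keep the topological order consistent
//   }
//
// Topo is assumed to be the owning ScheduleDAGTopologicalSort instance; the
// exact call sites and edge kinds vary between schedulers.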
/// biasCriticalPath - Order this node's predecessor edges such that the
/// critical path edge occurs first.
void SUnit::biasCriticalPath() {
  if (NumPreds < 2)
    return;

  SUnit::pred_iterator BestI = Preds.begin();
  unsigned MaxDepth = BestI->getSUnit()->getDepth();
  for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E;
       ++I) {
    if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) {
      MaxDepth = I->getSUnit()->getDepth();
      BestI = I;
    }
  }

  if (BestI != Preds.begin())
    std::swap(*Preds.begin(), *BestI);
}
/// numberRCValPredInSU - Count the data predecessors of SU whose results are
/// expected to occupy a register in class RCId.
unsigned ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
  unsigned NumberDeps = 0;
  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I) {
    if (I->isCtrl())
      continue;

    SUnit *PredSU = I->getSUnit();
    const SDNode *ScegN = PredSU->getNode();

    if (!ScegN)
      continue;

    // If value is passed to CopyToReg, it is probably
    // live outside BB.
    switch (ScegN->getOpcode()) {
      default:                  break;
      case ISD::TokenFactor:    break;
      case ISD::CopyFromReg:    NumberDeps++;  break;
      case ISD::CopyToReg:      break;
      case ISD::INLINEASM:      break;
    }
    if (!ScegN->isMachineOpcode())
      continue;

    for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
      MVT VT = ScegN->getSimpleValueType(i);
      if (TLI->isTypeLegal(VT) &&
          (TLI->getRegClassFor(VT)->getID() == RCId)) {
        NumberDeps++;
        break;
      }
    }
  }
  return NumberDeps;
}
/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
/// critical path.
static SDep *CriticalPathStep(SUnit *SU) {
  SDep *Next = 0;
  unsigned NextDepth = 0;
  // Find the predecessor edge with the greatest depth.
  for (SUnit::pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
       P != PE; ++P) {
    SUnit *PredSU = P->getSUnit();
    unsigned PredLatency = P->getLatency();
    unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
    // In the case of a latency tie, prefer an anti-dependency edge over
    // other types of edges.
    if (NextDepth < PredTotalLatency ||
        (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
      NextDepth = PredTotalLatency;
      Next = &*P;
    }
  }
  return Next;
}
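// Illustrative walk (mirrors the way BreakAntiDependencies below uses this
// helper): starting from the SUnit at the bottom of the critical path and
// repeatedly following the returned edge visits the whole path bottom-up:
//
//   for (SUnit *SU = Max; SU != 0; ) {
//     SDep *Edge = CriticalPathStep(SU);
//     SU = Edge ? Edge->getSUnit() : 0;
//   }
//
// Max is assumed to be the SUnit with the greatest depth-plus-latency, as
// computed at the top of BreakAntiDependencies.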
/// Main resource tracking point.
void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
  // Use NULL entry as an event marker to reset
  // the DFA state.
  if (!SU) {
    ResourcesModel->clearResources();
    Packet.clear();
    return;
  }

  const SDNode *ScegN = SU->getNode();
  // Update reg pressure tracking.
  // First update current node.
  if (ScegN->isMachineOpcode()) {
    // Estimate generated regs.
    for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
      MVT VT = ScegN->getSimpleValueType(i);

      if (TLI->isTypeLegal(VT)) {
        const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
        if (RC)
          RegPressure[RC->getID()] += numberRCValSuccInSU(SU, RC->getID());
      }
    }
    // Estimate killed regs.
    for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
      const SDValue &Op = ScegN->getOperand(i);
      MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());

      if (TLI->isTypeLegal(VT)) {
        const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
        if (RC) {
          if (RegPressure[RC->getID()] >
              (numberRCValPredInSU(SU, RC->getID())))
            RegPressure[RC->getID()] -= numberRCValPredInSU(SU, RC->getID());
          else
            RegPressure[RC->getID()] = 0;
        }
      }
    }
    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
         I != E; ++I) {
      if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0))
        continue;
      --I->getSUnit()->NumRegDefsLeft;
    }
  }

  // Reserve resources for this SU.
  reserveResources(SU);

  // Adjust number of parallel live ranges.
  // Heuristic is simple - node with no data successors reduces
  // number of live ranges. All others, increase it.
  unsigned NumberNonControlDeps = 0;

  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
       I != E; ++I) {
    adjustPriorityOfUnscheduledPreds(I->getSUnit());
    if (!I->isCtrl())
      NumberNonControlDeps++;
  }

  if (!NumberNonControlDeps) {
    if (ParallelLiveRanges >= SU->NumPreds)
      ParallelLiveRanges -= SU->NumPreds;
    else
      ParallelLiveRanges = 0;
  }
  else
    ParallelLiveRanges += SU->NumRegDefsLeft;

  // Track parallel live chains.
  HorizontalVerticalBalance += (SU->Succs.size() - numberCtrlDepsInSU(SU));
  HorizontalVerticalBalance -= (SU->Preds.size() - numberCtrlPredInSU(SU));
}
void PatmosPostRASchedStrategy::postprocessDAG(ScheduleDAGPostRA *dag) {
  DAG = dag;

  SUnit *CFL = NULL;
  // Find the inline asm statement, if any. Note that asm is a barrier,
  // therefore there is at most one CFL or inline asm.
  SUnit *Asm = NULL;

  // Push up loads to ensure load delay slot across BBs
  // TODO For some reason, loads do not always have exit edges, and a latency
  //      of 1; find out why. Happens e.g. in coremark with 16k methods setup.
  for (std::vector<SUnit>::reverse_iterator it = DAG->SUnits.rbegin(),
       ie = DAG->SUnits.rend(); it != ie; it++) {
    MachineInstr *MI = it->getInstr();
    if (!MI) continue;
    if (MI->mayLoad()) {
      SDep Dep(&*it, SDep::Artificial);
      Dep.setLatency(computeExitLatency(*it));
      DAG->ExitSU.addPred(Dep);
    }
  }

  // Find the branch/call/ret instruction if available
  for (std::vector<SUnit>::reverse_iterator it = DAG->SUnits.rbegin(),
       ie = DAG->SUnits.rend(); it != ie; it++) {
    MachineInstr *MI = it->getInstr();
    if (!MI) continue;
    if (isPatmosCFL(MI->getOpcode(), MI->getDesc().TSFlags)) {
      CFL = &*it;
      break;
    }
    if (MI->isInlineAsm()) {
      Asm = &*it;
      break;
    }
  }

  const PatmosSubtarget *PST = PTM.getSubtargetImpl();

  unsigned DelaySlot = CFL ? PST->getDelaySlotCycles(CFL->getInstr()) : 0;

  if (CFL) {
    // RET and CALL have implicit deps on the return values and call
    // arguments. Remove all those edges to schedule them into the delay slot
    // if the registers are not actually used by CALL and RET.
    if (CFL->getInstr()->isReturn() || CFL->getInstr()->isCall())
      removeImplicitCFLDeps(*CFL);

    // Add an artificial dep from CFL to exit for the delay slot
    SDep DelayDep(CFL, SDep::Artificial);
    DelayDep.setLatency(DelaySlot + 1);
    DAG->ExitSU.addPred(DelayDep);

    CFL->isScheduleLow = true;

    if (PTM.getSubtargetImpl()->getCFLType() != PatmosSubtarget::CFL_DELAYED) {
      // Push up single instructions that can be scheduled in the same
      // cycle as the branch
      unsigned LowCount = 0;
      SUnit *LowSU = 0;
      for (std::vector<SUnit>::reverse_iterator it = DAG->SUnits.rbegin(),
           ie = DAG->SUnits.rend(); it != ie; it++) {
        if (&*it == CFL) continue;

        MachineInstr *MI = it->getInstr();
        if (!MI) continue;

        if (it->getHeight() <= DelaySlot) {
          LowCount++;
          if (PII.canIssueInSlot(MI, LowCount)) {
            LowSU = &*it;
          }
        }
      }

      if (LowSU && LowCount == 1) {
        SDep Dep(LowSU, SDep::Artificial);
        Dep.setLatency(DelaySlot + 1);
        DAG->ExitSU.addPred(Dep);
      }
    }

    if (PTM.getSubtargetImpl()->getCFLType() == PatmosSubtarget::CFL_NON_DELAYED) {
      // Add dependencies from all other instructions to exit
      for (std::vector<SUnit>::reverse_iterator it = DAG->SUnits.rbegin(),
           ie = DAG->SUnits.rend(); it != ie; it++) {
        if (&*it == CFL) continue;

        MachineInstr *MI = it->getInstr();
        if (!MI) continue;

        SDep Dep(&*it, SDep::Artificial);
        Dep.setLatency(DelaySlot + 1);
        DAG->ExitSU.addPred(Dep);
      }
    }
  }

  // Add an exit delay between loads and inline asm, in case asm is empty
  if (Asm) {
    std::vector<SUnit*> PredLoads;
    for (SUnit::pred_iterator it = Asm->Preds.begin(), ie = Asm->Preds.end();
         it != ie; it++) {
      if (!it->getSUnit()) continue;

      MachineInstr *MI = it->getSUnit()->getInstr();

      // Check for loads
      if (!MI || !MI->mayLoad()) continue;

      PredLoads.push_back(it->getSUnit());
    }
    for (std::vector<SUnit*>::iterator it = PredLoads.begin(),
         ie = PredLoads.end(); it != ie; it++) {
      // Add a delay between loads and inline-asm, even if the operand is not
      // used.
      SDep Dep(*it, SDep::Artificial);
      Dep.setLatency(computeExitLatency(**it));
      Asm->addPred(Dep);
    }
  }

  // Remove barriers between loads/stores with different memory type
  removeTypedMemBarriers();

  // Remove any dependency between instructions with mutually exclusive
  // predicates
  removeExclusivePredDeps();

  // TODO SWS and LWS do not have ST as implicit def edges
  // TODO CALL has chain edges to all SWS/.. instructions, remove
  // TODO remove edges from MUL to other MULs to overlap MUL and MFS for
  //      pipelined muls.
}
unsigned CriticalAntiDepBreaker::
BreakAntiDependencies(std::vector<SUnit>& SUnits,
                      MachineBasicBlock::iterator& Begin,
                      MachineBasicBlock::iterator& End,
                      unsigned InsertPosIndex) {
  // The code below assumes that there is at least one instruction,
  // so just duck out immediately if the block is empty.
  if (SUnits.empty()) return 0;

  // Find the node at the bottom of the critical path.
  SUnit *Max = 0;
  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
      Max = SU;
  }

#ifndef NDEBUG
  {
    DEBUG(errs() << "Critical path has total latency "
          << (Max->getDepth() + Max->Latency) << "\n");
    DEBUG(errs() << "Available regs:");
    for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
      if (KillIndices[Reg] == ~0u)
        DEBUG(errs() << " " << TRI->getName(Reg));
    }
    DEBUG(errs() << '\n');
  }
#endif

  // Track progress along the critical path through the SUnit graph as we walk
  // the instructions.
  SUnit *CriticalPathSU = Max;
  MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();

  // Consider this pattern:
  //   A = ...
  //   ... = A
  //   A = ...
  //   ... = A
  //   A = ...
  //   ... = A
  //   A = ...
  //   ... = A
  // There are three anti-dependencies here, and without special care,
  // we'd break all of them using the same register:
  //   A = ...
  //   ... = A
  //   B = ...
  //   ... = B
  //   B = ...
  //   ... = B
  //   B = ...
  //   ... = B
  // because at each anti-dependence, B is the first register that
  // isn't A which is free. This re-introduces anti-dependencies
  // at all but one of the original anti-dependencies that we were
  // trying to break. To avoid this, keep track of the most recent
  // register that each register was replaced with, avoid
  // using it to repair an anti-dependence on the same register.
  // This lets us produce this:
  //   A = ...
  //   ... = A
  //   B = ...
  //   ... = B
  //   C = ...
  //   ... = C
  //   B = ...
  //   ... = B
  // This still has an anti-dependence on B, but at least it isn't on the
  // original critical path.
  //
  // TODO: If we tracked more than one register here, we could potentially
  // fix that remaining critical edge too. This is a little more involved,
  // because unlike the most recent register, less recent registers should
  // still be considered, though only if no other registers are available.
  unsigned LastNewReg[TargetRegisterInfo::FirstVirtualRegister] = {};

  // Attempt to break anti-dependence edges on the critical path. Walk the
  // instructions from the bottom up, tracking information about liveness
  // as we go to help determine which registers are available.
  unsigned Broken = 0;
  unsigned Count = InsertPosIndex - 1;
  for (MachineBasicBlock::iterator I = End, E = Begin;
       I != E; --Count) {
    MachineInstr *MI = --I;

    // Check if this instruction has a dependence on the critical path that
    // is an anti-dependence that we may be able to break. If it is, set
    // AntiDepReg to the non-zero register associated with the anti-dependence.
    //
    // We limit our attention to the critical path as a heuristic to avoid
    // breaking anti-dependence edges that aren't going to significantly
    // impact the overall schedule. There are a limited number of registers
    // and we want to save them for the important edges.
    //
    // TODO: Instructions with multiple defs could have multiple
    // anti-dependencies. The current code here only knows how to break one
    // edge per instruction. Note that we'd have to be able to break all of
    // the anti-dependencies in an instruction in order to be effective.
    unsigned AntiDepReg = 0;
    if (MI == CriticalPathMI) {
      if (SDep *Edge = CriticalPathStep(CriticalPathSU)) {
        SUnit *NextSU = Edge->getSUnit();

        // Only consider anti-dependence edges.
        if (Edge->getKind() == SDep::Anti) {
          AntiDepReg = Edge->getReg();
          assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
          if (!AllocatableSet.test(AntiDepReg))
            // Don't break anti-dependencies on non-allocatable registers.
            AntiDepReg = 0;
          else if (KeepRegs.count(AntiDepReg))
            // Don't break anti-dependencies if a use down below requires
            // this exact register.
            AntiDepReg = 0;
          else {
            // If the SUnit has other dependencies on the SUnit that it
            // anti-depends on, don't bother breaking the anti-dependency
            // since those edges would prevent such units from being
            // scheduled past each other regardless.
            //
            // Also, if there are dependencies on other SUnits with the
            // same register as the anti-dependency, don't attempt to
            // break it.
            for (SUnit::pred_iterator P = CriticalPathSU->Preds.begin(),
                 PE = CriticalPathSU->Preds.end(); P != PE; ++P)
              if (P->getSUnit() == NextSU ?
                    (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
                    (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
                AntiDepReg = 0;
                break;
              }
          }
        }
        CriticalPathSU = NextSU;
        CriticalPathMI = CriticalPathSU->getInstr();
      } else {
        // We've reached the end of the critical path.
        CriticalPathSU = 0;
        CriticalPathMI = 0;
      }
    }

    PrescanInstruction(MI);

    if (MI->getDesc().hasExtraDefRegAllocReq())
      // If this instruction's defs have special allocation requirement, don't
      // break this anti-dependency.
      AntiDepReg = 0;
    else if (AntiDepReg) {
      // If this instruction has a use of AntiDepReg, breaking it
      // is invalid.
      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
        MachineOperand &MO = MI->getOperand(i);
        if (!MO.isReg()) continue;
        unsigned Reg = MO.getReg();
        if (Reg == 0) continue;
        if (MO.isUse() && AntiDepReg == Reg) {
          AntiDepReg = 0;
          break;
        }
      }
    }

    // Determine AntiDepReg's register class, if it is live and is
    // consistently used within a single class.
    const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0;
    assert((AntiDepReg == 0 || RC != NULL) &&
           "Register should be live if it's causing an anti-dependence!");
    if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
      AntiDepReg = 0;

    // Look for a suitable register to use to break the anti-dependence.
    //
    // TODO: Instead of picking the first free register, consider which might
    // be the best.
    if (AntiDepReg != 0) {
      if (unsigned NewReg = findSuitableFreeRegister(AntiDepReg,
                                                     LastNewReg[AntiDepReg],
                                                     RC)) {
        DEBUG(errs() << "Breaking anti-dependence edge on "
              << TRI->getName(AntiDepReg)
              << " with " << RegRefs.count(AntiDepReg) << " references"
              << " using " << TRI->getName(NewReg) << "!\n");

        // Update the references to the old register to refer to the new
        // register.
        std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
                  std::multimap<unsigned, MachineOperand *>::iterator>
           Range = RegRefs.equal_range(AntiDepReg);
        for (std::multimap<unsigned, MachineOperand *>::iterator
             Q = Range.first, QE = Range.second; Q != QE; ++Q)
          Q->second->setReg(NewReg);

        // We just went back in time and modified history; the
        // liveness information for the anti-dependence reg is now
        // inconsistent. Set the state as if it were dead.
        Classes[NewReg] = Classes[AntiDepReg];
        DefIndices[NewReg] = DefIndices[AntiDepReg];
        KillIndices[NewReg] = KillIndices[AntiDepReg];
        assert(((KillIndices[NewReg] == ~0u) !=
                (DefIndices[NewReg] == ~0u)) &&
               "Kill and Def maps aren't consistent for NewReg!");

        Classes[AntiDepReg] = 0;
        DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
        KillIndices[AntiDepReg] = ~0u;
        assert(((KillIndices[AntiDepReg] == ~0u) !=
                (DefIndices[AntiDepReg] == ~0u)) &&
               "Kill and Def maps aren't consistent for AntiDepReg!");

        RegRefs.erase(AntiDepReg);
        LastNewReg[AntiDepReg] = NewReg;
        ++Broken;
      }
    }

    ScanInstruction(MI, Count);
  }

  return Broken;
}
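// Illustrative call site (hypothetical names): a post-RA scheduler would run
// the breaker over a scheduling region after building the SUnit graph, e.g.
//
//   unsigned Broken =
//       ADB.BreakAntiDependencies(SUnits, RegionBegin, RegionEnd, EndIndex);
//   if (Broken != 0) {
//     // Register operands changed, so the region's dependence graph and
//     // schedule must be rebuilt before scheduling proceeds.
//   }
//
// ADB, RegionBegin/RegionEnd, and EndIndex stand in for the caller's own
// state; only the argument order follows the signature above.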