/// Terminate all open ranges at the end of the current basic block.
bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
                                             VarLocList &OpenRanges,
                                             VarLocInMBB &OutLocs) {
  bool Changed = false;
  const MachineBasicBlock *CurMBB = MI.getParent();
  if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back())))
    return false;

  if (OpenRanges.empty())
    return false;

  VarLocList &VLL = OutLocs[CurMBB];

  for (auto OR : OpenRanges) {
    // Copy OpenRanges to OutLocs, if not already present.
    assert(OR.MI->isDebugValue());
    DEBUG(dbgs() << "Add to OutLocs: "; OR.MI->dump(););
    if (std::find_if(VLL.begin(), VLL.end(),
                     [&](const VarLoc &V) { return (OR == V); }) == VLL.end()) {
      VLL.push_back(std::move(OR));
      Changed = true;
    }
  }
  OpenRanges.clear();
  return Changed;
}
/// Generate a conditional transfer, copying the value SrcOp to the
/// destination register DstR:DstSR, and using the predicate register from
/// PredOp. The Cond argument specifies whether the predicate is to be
/// if(PredOp), or if(!PredOp).
MachineInstr *HexagonExpandCondsets::genTfrFor(MachineOperand &SrcOp,
      unsigned DstR, unsigned DstSR, const MachineOperand &PredOp, bool Cond) {
  MachineInstr *MI = SrcOp.getParent();
  MachineBasicBlock &B = *MI->getParent();
  MachineBasicBlock::iterator At = MI;
  DebugLoc DL = MI->getDebugLoc();

  // Don't avoid identity copies here (i.e. if the source and the destination
  // are the same registers). It is actually better to generate them here,
  // since this would cause the copy to potentially be predicated in the next
  // step. The predication will remove such a copy if it is unable to
  // predicate.

  unsigned Opc = getCondTfrOpcode(SrcOp, Cond);
  MachineInstr *TfrI = BuildMI(B, At, DL, HII->get(Opc))
        .addReg(DstR, RegState::Define, DstSR)
        .addOperand(PredOp)
        .addOperand(SrcOp);
  // We don't want any kills yet.
  TfrI->clearKillInfo();
  DEBUG(dbgs() << "created an initial copy: " << *TfrI);
  return TfrI;
}
/// Given a basic block \p Successor that potentially contains PHIs, this
/// function will look for any incoming values in the PHIs that are supposed to
/// be coming from \p OrigMBB but whose definition is actually in \p NewMBB.
/// Any such PHIs will be updated to reflect reality.
static void updatePHIs(MachineBasicBlock *Successor, MachineBasicBlock *OrigMBB,
                       MachineBasicBlock *NewMBB, MachineRegisterInfo *MRI) {
  for (auto &MI : Successor->instrs()) {
    if (!MI.isPHI())
      continue;
    // This is a really ugly-looking loop, but it was pillaged directly from
    // MachineBasicBlock::transferSuccessorsAndUpdatePHIs().
    for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
      MachineOperand &MO = MI.getOperand(i);
      if (MO.getMBB() == OrigMBB) {
        // Check if the instruction is actually defined in NewMBB.
        if (MI.getOperand(i - 1).isReg()) {
          MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(i - 1).getReg());
          if (DefMI->getParent() == NewMBB ||
              !OrigMBB->isSuccessor(Successor)) {
            MO.setMBB(NewMBB);
            break;
          }
        }
      }
    }
  }
}
/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
                                          MachineBasicBlock *MBB,
                                          MachineBasicBlock *SuccToSinkTo) {
  assert (MI && "Invalid MachineInstr!");
  assert (SuccToSinkTo && "Invalid SinkTo Candidate BB");

  if (MBB == SuccToSinkTo)
    return false;

  // It is profitable if SuccToSinkTo does not post dominate current block.
  if (!isPostDominatedBy(MBB, SuccToSinkTo))
    return true;

  // Check if only use in post dominated block is PHI instruction.
  bool NonPHIUse = false;
  for (MachineRegisterInfo::use_nodbg_iterator
         I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end();
       I != E; ++I) {
    MachineInstr *UseInst = &*I;
    MachineBasicBlock *UseBlock = UseInst->getParent();
    if (UseBlock == SuccToSinkTo && !UseInst->isPHI())
      NonPHIUse = true;
  }
  if (!NonPHIUse)
    return true;

  // Even if SuccToSinkTo post dominates the current block, sinking may still
  // be profitable if MI can be sunk further into another block in the next
  // round.
  bool BreakPHIEdge = false;
  // FIXME - If finding a successor is compile-time expensive then cache results.
  if (MachineBasicBlock *MBB2 = FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge))
    return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2);

  // If SuccToSinkTo is the final destination and it is a post dominator of the
  // current block, then it is not profitable to sink MI into SuccToSinkTo.
  return false;
}
bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
                                          MachineBasicBlock *MBB) {
  bool Changed = false;
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg() || !MO.isUse())
      continue;
    unsigned Reg = MO.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (!MRI->hasOneNonDBGUse(Reg))
      // Only coalesce single use copies. This ensures the copy will be
      // deleted.
      continue;
    MachineInstr *DefMI = MRI->getVRegDef(Reg);
    if (DefMI->getParent() != MBB)
      continue;
    if (!DefMI->isCopy())
      continue;
    unsigned SrcReg = DefMI->getOperand(1).getReg();
    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
      continue;
    if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg())
      continue;
    if (!MRI->constrainRegClass(SrcReg, MRI->getRegClass(Reg)))
      continue;
    DEBUG(dbgs() << "Coalescing: " << *DefMI);
    DEBUG(dbgs() << "*** to: " << *MI);
    MO.setReg(SrcReg);
    MRI->clearKillFlags(SrcReg);
    DefMI->eraseFromParent();
    ++NumCoalesces;
    Changed = true;
  }

  return Changed;
}
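// A standalone sketch of the same single-use copy coalescing idea on
// simplified stand-in types rather than the MachineIR API. Everything below
// (Operand, Instr, RegInfo, coalesceTrivialCopies) is an illustrative
// assumption, not LLVM code: a use operand is rewritten to read the source of
// a COPY whenever that COPY's result has exactly one use, and the dead COPY
// is then erased.
#include <list>
#include <unordered_map>
#include <vector>

struct Operand {
  int Reg = 0;
  bool IsUse = false;
};

struct Instr {
  bool IsCopy = false;
  std::vector<Operand> Ops; // Ops[0] is the def; Ops[1] is the copy source.
};

struct RegInfo {
  Instr *Def = nullptr; // unique defining instruction (SSA-like form assumed)
  unsigned NumUses = 0; // number of non-debug uses
};

// Forward the source of single-use copies into MI's use operands and erase
// the now-dead copy. Returns true if anything changed. A real pass would also
// update use counts and kill flags, which this sketch omits.
bool coalesceTrivialCopies(Instr &MI, std::list<Instr> &Block,
                           std::unordered_map<int, RegInfo> &Regs) {
  bool Changed = false;
  for (Operand &MO : MI.Ops) {
    if (!MO.IsUse)
      continue;
    auto It = Regs.find(MO.Reg);
    if (It == Regs.end() || It->second.NumUses != 1)
      continue; // only coalesce single-use values so the copy dies
    Instr *DefMI = It->second.Def;
    if (!DefMI || !DefMI->IsCopy)
      continue;
    MO.Reg = DefMI->Ops[1].Reg;                                   // forward source
    Block.remove_if([&](const Instr &I) { return &I == DefMI; }); // delete copy
    Changed = true;
  }
  return Changed;
}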
bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { MachineInstr *MI = MBBI; MachineBasicBlock::iterator E = MI->getParent()->end(); // If this is a volatile load/store, don't mess with it. if (MI->hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm (as opposed to an address reloc). if (!getLdStOffsetOp(MI).isImm()) return false; // Check if this load/store has a hint to avoid pair formation. // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. if (TII->isLdStPairSuppressed(MI)) return false; // Look ahead up to ScanLimit instructions for a pairable instruction. LdStPairFlags Flags; MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); if (Paired != E) { if (isSmallTypeLdMerge(MI)) { ++NumSmallTypeMerged; } else { ++NumPairCreated; if (isUnscaledLdSt(MI)) ++NumUnscaledPairCreated; } // Merge the loads into a pair. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction // is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } return false; }
// Compare compares the result of MI against zero. If MI is a suitable load // instruction and if CCUsers is a single conditional trap on zero, eliminate // the load and convert the branch to a load-and-trap. Return true on success. bool SystemZElimCompare::convertToLoadAndTrap( MachineInstr &MI, MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) { unsigned LATOpcode = TII->getLoadAndTrap(MI.getOpcode()); if (!LATOpcode) return false; // Check whether we have a single CondTrap that traps on zero. if (CCUsers.size() != 1) return false; MachineInstr *Branch = CCUsers[0]; if (Branch->getOpcode() != SystemZ::CondTrap || Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP || Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_EQ) return false; // We already know that there are no references to the register between // MI and Compare. Make sure that there are also no references between // Compare and Branch. unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (getRegReferences(*MBBI, SrcReg)) return false; // The transformation is OK. Rebuild Branch as a load-and-trap. while (Branch->getNumOperands()) Branch->RemoveOperand(0); Branch->setDesc(TII->get(LATOpcode)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .addOperand(MI.getOperand(0)) .addOperand(MI.getOperand(1)) .addOperand(MI.getOperand(2)) .addOperand(MI.getOperand(3)); MI.eraseFromParent(); return true; }
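// The safety check above walks every instruction strictly between Compare and
// Branch and gives up if any of them touches the compare's source register. A
// minimal standalone version of that range scan, using hypothetical stand-in
// types instead of the SystemZ/MachineIR API.
#include <iterator>
#include <vector>

struct Inst {
  std::vector<int> RegsReferenced; // registers read or written by this inst
};

// Returns true if any instruction strictly between Begin and End references
// Reg; Begin and End themselves are not inspected, matching the
// "for (++MBBI; MBBI != MBBE; ++MBBI)" loop in the entry above.
bool regReferencedBetween(std::vector<Inst>::const_iterator Begin,
                          std::vector<Inst>::const_iterator End, int Reg) {
  if (Begin == End)
    return false;
  for (auto It = std::next(Begin); It != End; ++It)
    for (int R : It->RegsReferenced)
      if (R == Reg)
        return true;
  return false;
}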
void SIInsertSkips::kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Op = MI.getOperand(0); #ifndef NDEBUG CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); // Kill is only allowed in pixel / geometry shaders. assert(CallConv == CallingConv::AMDGPU_PS || CallConv == CallingConv::AMDGPU_GS); #endif // Clear this thread from the exec mask if the operand is negative. if (Op.isImm()) { // Constant operand: Set exec mask to 0 or do nothing if (Op.getImm() & 0x80000000) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) .addImm(0); } } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } }
void RegDefsUses::setCallerSaved(const MachineInstr &MI) { assert(MI.isCall()); // Add RA/RA_64 to Defs to prevent users of RA/RA_64 from going into // the delay slot. The reason is that RA/RA_64 must not be changed // in the delay slot so that the callee can return to the caller. if (MI.definesRegister(Mips::RA) || MI.definesRegister(Mips::RA_64)) { Defs.set(Mips::RA); Defs.set(Mips::RA_64); } // If MI is a call, add all caller-saved registers to Defs. BitVector CallerSavedRegs(TRI.getNumRegs(), true); CallerSavedRegs.reset(Mips::ZERO); CallerSavedRegs.reset(Mips::ZERO_64); for (const MCPhysReg *R = TRI.getCalleeSavedRegs(MI.getParent()->getParent()); *R; ++R) for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI) CallerSavedRegs.reset(*AI); Defs |= CallerSavedRegs; }
bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, ValueType &Val) const { if (!MI.isCall()) return false; MachineOperand *MO = getCallTargetRegOpnd(MI); // Return if MI is not a function call via a register. if (!MO) return false; // Get the instruction that loads the function address from the GOT. Reg = MO->getReg(); Val = (Value*)nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = MRI.getVRegDef(Reg); assert(DefMI); // See if DefMI is an instruction that loads from a GOT entry that holds the // address of a lazy binding stub. if (!DefMI->mayLoad() || DefMI->getNumOperands() < 3) return true; unsigned Flags = DefMI->getOperand(2).getTargetFlags(); if (Flags != MipsII::MO_GOT_CALL && Flags != MipsII::MO_CALL_LO16) return true; // Return the underlying object for the GOT entry in Val. assert(DefMI->hasOneMemOperand()); Val = (*DefMI->memoperands_begin())->getValue(); if (!Val) Val = (*DefMI->memoperands_begin())->getPseudoValue(); return true; }
bool InstructionSelector::constrainSelectedInstRegOperands( MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) const { MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned OpI = 0, OpE = I.getNumExplicitOperands(); OpI != OpE; ++OpI) { MachineOperand &MO = I.getOperand(OpI); // There's nothing to be done on non-register operands. if (!MO.isReg()) continue; DEBUG(dbgs() << "Converting operand: " << MO << '\n'); assert(MO.isReg() && "Unsupported non-reg operand"); // Physical registers don't need to be constrained. if (TRI.isPhysicalRegister(MO.getReg())) continue; const TargetRegisterClass *RC = TII.getRegClass(I.getDesc(), OpI, &TRI, MF); assert(RC && "Selected inst should have regclass operand"); // If the operand is a vreg, we should constrain its regclass, and only // insert COPYs if that's impossible. // If the operand is a physreg, we only insert COPYs if the register class // doesn't contain the register. if (RBI.constrainGenericRegister(MO.getReg(), *RC, MRI)) continue; DEBUG(dbgs() << "Constraining with COPYs isn't implemented yet"); return false; } return true; }
bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) const { const MachineBasicBlock *MBB = Inst.getParent(); const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); unsigned AssocOpcode = Inst.getOpcode(); // If only one operand has the same opcode and it's the second source operand, // the operands must be commuted. Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; if (Commuted) std::swap(MI1, MI2); // 1. The previous instruction must be the same type as Inst. // 2. The previous instruction must have virtual register definitions for its // operands in the same basic block as Inst. // 3. The previous instruction's result must only be used by Inst. if (MI1->getOpcode() == AssocOpcode && hasReassociableOperands(*MI1, MBB) && MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) return true; return false; }
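// A standalone sketch of the sibling check above on a toy expression node:
// pick whichever of the two source definitions shares the associative opcode,
// commuting when only the second one matches, and accept it only if its
// result has a single user. The Node type and helper below are illustrative
// assumptions, not the TargetInstrInfo API.
#include <utility>

struct Node {
  int Opcode = 0;
  Node *Op1 = nullptr;   // defining node of the first source operand
  Node *Op2 = nullptr;   // defining node of the second source operand
  bool SingleUse = true; // result feeds only its single user
};

// Returns the reassociable sibling definition, or nullptr if there is none.
Node *findReassociableSibling(const Node &Inst, bool &Commuted) {
  Node *MI1 = Inst.Op1, *MI2 = Inst.Op2;
  Commuted = false;
  if (!MI1 || !MI2)
    return nullptr;
  // If only the second source is produced by the same opcode, swap so the
  // candidate always ends up in MI1.
  Commuted = MI1->Opcode != Inst.Opcode && MI2->Opcode == Inst.Opcode;
  if (Commuted)
    std::swap(MI1, MI2);
  // The sibling must share the opcode and have no other users, so
  // reassociating does not duplicate its computation.
  return (MI1->Opcode == Inst.Opcode && MI1->SingleUse) ? MI1 : nullptr;
}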
ScheduleHazardRecognizer::HazardType HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) { MachineInstr *MI = SU->getInstr(); if (!MI || TII->isZeroCost(MI->getOpcode())) return NoHazard; if (!Resources->canReserveResources(*MI)) { DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI); HazardType RetVal = Hazard; if (TII->mayBeNewStore(*MI)) { // Make sure the register to be stored is defined by an instruction in the // packet. MachineOperand &MO = MI->getOperand(MI->getNumOperands() - 1); if (!MO.isReg() || RegDefs.count(MO.getReg()) == 0) return Hazard; // The .new store version uses different resources so check if it // causes a hazard. MachineFunction *MF = MI->getParent()->getParent(); MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)), MI->getDebugLoc()); if (Resources->canReserveResources(*NewMI)) RetVal = NoHazard; DEBUG(dbgs() << "*** Try .new version? " << (RetVal == NoHazard) << "\n"); MF->DeleteMachineInstr(NewMI); } return RetVal; } if (SU == UsesDotCur && DotCurPNum != (int)PacketNum) { DEBUG(dbgs() << "*** .cur Hazard in cycle " << PacketNum << ", " << *MI); return Hazard; } return NoHazard; }
/// Return true if MI is likely to be usable as a memory operation by the /// implicit null check optimization. /// /// This is a "best effort" heuristic, and should not be relied upon for /// correctness. This returning true does not guarantee that the implicit null /// check optimization is legal over MI, and this returning false does not /// guarantee MI cannot possibly be used to do a null check. static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) { using MachineBranchPredicate = TargetInstrInfo::MachineBranchPredicate; auto *MBB = MI.getParent(); if (MBB->pred_size() != 1) return false; auto *PredMBB = *MBB->pred_begin(); auto *PredBB = PredMBB->getBasicBlock(); // Frontends that don't use implicit null checks have no reason to emit // branches with make.implicit metadata, and this function should always // return false for them. if (!PredBB || !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit)) return false; unsigned BaseReg; int64_t Offset; if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) return false; if (!(MI.mayLoad() && !MI.isPredicable())) return false; MachineBranchPredicate MBP; if (TII->analyzeBranchPredicate(*PredMBB, MBP, false)) return false; return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 && (MBP.Predicate == MachineBranchPredicate::PRED_NE || MBP.Predicate == MachineBranchPredicate::PRED_EQ) && MBP.LHS.getReg() == BaseReg; }
MachineInstr *TwoAddressInstructionPass::FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist) { unsigned LastUseDist = 0; MachineInstr *LastUse = 0; for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg), E = MRI->reg_end(); I != E; ++I) { MachineOperand &MO = I.getOperand(); MachineInstr *MI = MO.getParent(); if (MI->getParent() != MBB) continue; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) continue; if (DI->second >= Dist) continue; if (MO.isUse() && DI->second > LastUseDist) { LastUse = DI->first; LastUseDist = DI->second; } } return LastUse; }
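// FindLastUseInMBB above selects, among a register's uses in the block, the
// one with the largest DistanceMap value that is still below Dist. The same
// "latest use before a limit" scan on plain containers; UseSite and the
// integer instruction ids are hypothetical stand-ins, not the two-address
// pass API.
#include <unordered_map>
#include <vector>

struct UseSite {
  int InstrId = -1;         // identifies the using instruction
  bool InThisBlock = false; // whether the use lives in the block of interest
};

// Distances maps an instruction id to its position in the block. Uses missing
// from the map or at/after Limit are ignored, mirroring the entry above.
// Returns the id of the latest qualifying use, or -1 if there is none.
int findLastUseBefore(const std::vector<UseSite> &Uses,
                      const std::unordered_map<int, unsigned> &Distances,
                      unsigned Limit) {
  int LastUse = -1;
  unsigned LastUseDist = 0;
  for (const UseSite &U : Uses) {
    if (!U.InThisBlock)
      continue;
    auto DI = Distances.find(U.InstrId);
    if (DI == Distances.end() || DI->second >= Limit)
      continue;
    if (DI->second > LastUseDist) {
      LastUse = U.InstrId;
      LastUseDist = DI->second;
    }
  }
  return LastUse;
}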
/// traceSiblingValue - Trace a value that is about to be spilled back to the /// real defining instructions by looking through sibling copies. Always stay /// within the range of OrigVNI so the registers are known to carry the same /// value. /// /// Determine if the value is defined by all reloads, so spilling isn't /// necessary - the value is already in the stack slot. /// /// Return a defining instruction that may be a candidate for rematerialization. /// MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI, VNInfo *OrigVNI) { // Check if a cached value already exists. SibValueMap::iterator SVI; bool Inserted; tie(SVI, Inserted) = SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI))); if (!Inserted) { DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':' << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second); return SVI->second.DefMI; } DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':' << UseVNI->id << '@' << UseVNI->def << '\n'); // List of (Reg, VNI) that have been inserted into SibValues, but need to be // processed. SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList; WorkList.push_back(std::make_pair(UseReg, UseVNI)); do { unsigned Reg; VNInfo *VNI; tie(Reg, VNI) = WorkList.pop_back_val(); DEBUG(dbgs() << " " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def << ":\t"); // First check if this value has already been computed. SVI = SibValues.find(VNI); assert(SVI != SibValues.end() && "Missing SibValues entry"); // Trace through PHI-defs created by live range splitting. if (VNI->isPHIDef()) { // Stop at original PHIs. We don't know the value at the predecessors. if (VNI->def == OrigVNI->def) { DEBUG(dbgs() << "orig phi value\n"); SVI->second.DefByOrigPHI = true; SVI->second.AllDefsAreReloads = false; propagateSiblingValue(SVI); continue; } // This is a PHI inserted by live range splitting. We could trace the // live-out value from predecessor blocks, but that search can be very // expensive if there are many predecessors and many more PHIs as // generated by tail-dup when it sees an indirectbr. Instead, look at // all the non-PHI defs that have the same value as OrigVNI. They must // jointly dominate VNI->def. This is not optimal since VNI may actually // be jointly dominated by a smaller subset of defs, so there is a change // we will miss a AllDefsAreReloads optimization. // Separate all values dominated by OrigVNI into PHIs and non-PHIs. SmallVector<VNInfo*, 8> PHIs, NonPHIs; LiveInterval &LI = LIS.getInterval(Reg); LiveInterval &OrigLI = LIS.getInterval(Original); for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end(); VI != VE; ++VI) { VNInfo *VNI2 = *VI; if (VNI2->isUnused()) continue; if (!OrigLI.containsOneValue() && OrigLI.getVNInfoAt(VNI2->def) != OrigVNI) continue; if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def) PHIs.push_back(VNI2); else NonPHIs.push_back(VNI2); } DEBUG(dbgs() << "split phi value, checking " << PHIs.size() << " phi-defs, and " << NonPHIs.size() << " non-phi/orig defs\n"); // Create entries for all the PHIs. Don't add them to the worklist, we // are processing all of them in one go here. for (unsigned i = 0, e = PHIs.size(); i != e; ++i) SibValues.insert(std::make_pair(PHIs[i], SibValueInfo(Reg, PHIs[i]))); // Add every PHI as a dependent of all the non-PHIs. for (unsigned i = 0, e = NonPHIs.size(); i != e; ++i) { VNInfo *NonPHI = NonPHIs[i]; // Known value? Try an insertion. 
tie(SVI, Inserted) = SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI))); // Add all the PHIs as dependents of NonPHI. for (unsigned pi = 0, pe = PHIs.size(); pi != pe; ++pi) SVI->second.Deps.push_back(PHIs[pi]); // This is the first time we see NonPHI, add it to the worklist. if (Inserted) WorkList.push_back(std::make_pair(Reg, NonPHI)); else // Propagate to all inserted PHIs, not just VNI. propagateSiblingValue(SVI); } // Next work list item. continue; } MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); assert(MI && "Missing def"); // Trace through sibling copies. if (unsigned SrcReg = isFullCopyOf(MI, Reg)) { if (isSibling(SrcReg)) { LiveInterval &SrcLI = LIS.getInterval(SrcReg); LiveRangeQuery SrcQ(SrcLI, VNI->def); assert(SrcQ.valueIn() && "Copy from non-existing value"); // Check if this COPY kills its source. SVI->second.KillsSource = SrcQ.isKill(); VNInfo *SrcVNI = SrcQ.valueIn(); DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':' << SrcVNI->id << '@' << SrcVNI->def << " kill=" << unsigned(SVI->second.KillsSource) << '\n'); // Known sibling source value? Try an insertion. tie(SVI, Inserted) = SibValues.insert(std::make_pair(SrcVNI, SibValueInfo(SrcReg, SrcVNI))); // This is the first time we see Src, add it to the worklist. if (Inserted) WorkList.push_back(std::make_pair(SrcReg, SrcVNI)); propagateSiblingValue(SVI, VNI); // Next work list item. continue; } } // Track reachable reloads. SVI->second.DefMI = MI; SVI->second.SpillMBB = MI->getParent(); int FI; if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) { DEBUG(dbgs() << "reload\n"); propagateSiblingValue(SVI); // Next work list item. continue; } // Potential remat candidate. DEBUG(dbgs() << "def " << *MI); SVI->second.AllDefsAreReloads = false; propagateSiblingValue(SVI); } while (!WorkList.empty()); // Look up the value we were looking for. We already did this lookup at the // top of the function, but SibValues may have been invalidated. SVI = SibValues.find(UseVNI); assert(SVI != SibValues.end() && "Didn't compute requested info"); DEBUG(dbgs() << " traced to:\t" << SVI->second); return SVI->second.DefMI; }
/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely /// be speculated. The terminators are not considered. /// /// If instructions use any values that are defined in the head basic block, /// the defining instructions are added to InsertAfter. /// /// Any clobbered regunits are added to ClobberedRegUnits. /// bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to // get right. if (!MBB->livein_empty()) { DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); return false; } unsigned InstrCount = 0; // Check all instructions, except the terminators. It is assumed that // terminators never have side effects or define any used register values. for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->getFirstTerminator(); I != E; ++I) { if (I->isDebugValue()) continue; if (++InstrCount > BlockInstrLimit && !Stress) { DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " << BlockInstrLimit << " instructions.\n"); return false; } // There shouldn't normally be any phis in a single-predecessor block. if (I->isPHI()) { DEBUG(dbgs() << "Can't hoist: " << *I); return false; } // Don't speculate loads. Note that it may be possible and desirable to // speculate GOT or constant pool loads that are guaranteed not to trap, // but we don't support that for now. if (I->mayLoad()) { DEBUG(dbgs() << "Won't speculate load: " << *I); return false; } // We never speculate stores, so an AA pointer isn't necessary. bool DontMoveAcrossStore = true; if (!I->isSafeToMove(nullptr, DontMoveAcrossStore)) { DEBUG(dbgs() << "Can't speculate: " << *I); return false; } // Check for any dependencies on Head instructions. for (MIOperands MO(I); MO.isValid(); ++MO) { if (MO->isRegMask()) { DEBUG(dbgs() << "Won't speculate regmask: " << *I); return false; } if (!MO->isReg()) continue; unsigned Reg = MO->getReg(); // Remember clobbered regunits. if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg)) for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) ClobberedRegUnits.set(*Units); if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg)) continue; MachineInstr *DefMI = MRI->getVRegDef(Reg); if (!DefMI || DefMI->getParent() != Head) continue; if (InsertAfter.insert(DefMI).second) DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI); if (DefMI->isTerminator()) { DEBUG(dbgs() << "Can't insert instructions below terminator.\n"); return false; } } } return true; }
/// fixupConditionalBranch - Fix up a conditional branch whose destination is /// too far away to fit in its displacement field. It is converted to an inverse /// conditional branch + an unconditional branch to the destination. bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { DebugLoc DL = MI.getDebugLoc(); MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; MachineBasicBlock *NewBB = nullptr; SmallVector<MachineOperand, 4> Cond; auto insertUncondBranch = [&](MachineBasicBlock *MBB, MachineBasicBlock *DestBB) { unsigned &BBSize = BlockInfo[MBB->getNumber()].Size; int NewBrSize = 0; TII->insertUnconditionalBranch(*MBB, DestBB, DL, &NewBrSize); BBSize += NewBrSize; }; auto insertBranch = [&](MachineBasicBlock *MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, SmallVectorImpl<MachineOperand>& Cond) { unsigned &BBSize = BlockInfo[MBB->getNumber()].Size; int NewBrSize = 0; TII->insertBranch(*MBB, TBB, FBB, Cond, DL, &NewBrSize); BBSize += NewBrSize; }; auto removeBranch = [&](MachineBasicBlock *MBB) { unsigned &BBSize = BlockInfo[MBB->getNumber()].Size; int RemovedSize = 0; TII->removeBranch(*MBB, &RemovedSize); BBSize -= RemovedSize; }; auto finalizeBlockChanges = [&](MachineBasicBlock *MBB, MachineBasicBlock *NewBB) { // Keep the block offsets up to date. adjustBlockOffsets(*MBB); // Need to fix live-in lists if we track liveness. if (NewBB && TRI->trackLivenessAfterRegAlloc(*MF)) computeAndAddLiveIns(LiveRegs, *NewBB); }; bool Fail = TII->analyzeBranch(*MBB, TBB, FBB, Cond); assert(!Fail && "branches to be relaxed must be analyzable"); (void)Fail; // Add an unconditional branch to the destination and invert the branch // condition to jump over it: // tbz L1 // => // tbnz L2 // b L1 // L2: bool ReversedCond = !TII->reverseBranchCondition(Cond); if (ReversedCond) { if (FBB && isBlockInRange(MI, *FBB)) { // Last MI in the BB is an unconditional branch. We can simply invert the // condition and swap destinations: // beq L1 // b L2 // => // bne L2 // b L1 LLVM_DEBUG(dbgs() << " Invert condition and swap " "its destination with " << MBB->back()); removeBranch(MBB); insertBranch(MBB, FBB, TBB, Cond); finalizeBlockChanges(MBB, nullptr); return true; } if (FBB) { // We need to split the basic block here to obtain two long-range // unconditional branches. NewBB = createNewBlockAfter(*MBB); insertUncondBranch(NewBB, FBB); // Update the succesor lists according to the transformation to follow. // Do it here since if there's no split, no update is needed. MBB->replaceSuccessor(FBB, NewBB); NewBB->addSuccessor(FBB); } // We now have an appropriate fall-through block in place (either naturally or // just created), so we can use the inverted the condition. MachineBasicBlock &NextBB = *std::next(MachineFunction::iterator(MBB)); LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*TBB) << ", invert condition and change dest. to " << printMBBReference(NextBB) << '\n'); removeBranch(MBB); // Insert a new conditional branch and a new unconditional branch. insertBranch(MBB, &NextBB, TBB, Cond); finalizeBlockChanges(MBB, NewBB); return true; } // Branch cond can't be inverted. // In this case we always add a block after the MBB. LLVM_DEBUG(dbgs() << " The branch condition can't be inverted. " << " Insert a new BB after " << MBB->back()); if (!FBB) FBB = &(*std::next(MachineFunction::iterator(MBB))); // This is the block with cond. branch and the distance to TBB is too long. 
// beq L1 // L2: // We do the following transformation: // beq NewBB // b L2 // NewBB: // b L1 // L2: NewBB = createNewBlockAfter(*MBB); insertUncondBranch(NewBB, TBB); LLVM_DEBUG(dbgs() << " Insert cond B to the new BB " << printMBBReference(*NewBB) << " Keep the exiting condition.\n" << " Insert B to " << printMBBReference(*FBB) << ".\n" << " In the new BB: Insert B to " << printMBBReference(*TBB) << ".\n"); // Update the successor lists according to the transformation to follow. MBB->replaceSuccessor(TBB, NewBB); NewBB->addSuccessor(TBB); // Replace branch in the current (MBB) block. removeBranch(MBB); insertBranch(MBB, NewBB, FBB, Cond); finalizeBlockChanges(MBB, NewBB); return true; }
/// spillAroundUses - insert spill code around each use of Reg. void InlineSpiller::spillAroundUses(unsigned Reg) { DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n'); LiveInterval &OldLI = LIS.getInterval(Reg); // Iterate over instructions using Reg. for (MachineRegisterInfo::reg_bundle_iterator RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end(); RegI != E; ) { MachineInstr *MI = &*(RegI++); // Debug values are not allowed to affect codegen. if (MI->isDebugValue()) { // Modify DBG_VALUE now that the value is in a spill slot. bool IsIndirect = MI->isIndirectDebugValue(); uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0; const MDNode *MDPtr = MI->getOperand(2).getMetadata(); DebugLoc DL = MI->getDebugLoc(); DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI); MachineBasicBlock *MBB = MI->getParent(); BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE)) .addFrameIndex(StackSlot).addImm(Offset).addMetadata(MDPtr); continue; } // Ignore copies to/from snippets. We'll delete them. if (SnippetCopies.count(MI)) continue; // Stack slot accesses may coalesce away. if (coalesceStackAccess(MI, Reg)) continue; // Analyze instruction. SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops; MIBundleOperands::VirtRegInfo RI = MIBundleOperands(MI).analyzeVirtReg(Reg, &Ops); // Find the slot index where this instruction reads and writes OldLI. // This is usually the def slot, except for tied early clobbers. SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot(); if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true))) if (SlotIndex::isSameInstr(Idx, VNI->def)) Idx = VNI->def; // Check for a sibling copy. unsigned SibReg = isFullCopyOf(MI, Reg); if (SibReg && isSibling(SibReg)) { // This may actually be a copy between snippets. if (isRegToSpill(SibReg)) { DEBUG(dbgs() << "Found new snippet copy: " << *MI); SnippetCopies.insert(MI); continue; } if (RI.Writes) { // Hoist the spill of a sib-reg copy. if (hoistSpill(OldLI, MI)) { // This COPY is now dead, the value is already in the stack slot. MI->getOperand(0).setIsDead(); DeadDefs.push_back(MI); continue; } } else { // This is a reload for a sib-reg copy. Drop spills downstream. LiveInterval &SibLI = LIS.getInterval(SibReg); eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx)); // The COPY will fold to a reload below. } } // Attempt to fold memory ops. if (foldMemoryOperand(Ops)) continue; // Create a new virtual register for spill/fill. // FIXME: Infer regclass from instruction alone. unsigned NewVReg = Edit->createFrom(Reg); if (RI.Reads) insertReload(NewVReg, Idx, MI); // Rewrite instruction operands. bool hasLiveDef = false; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second); MO.setReg(NewVReg); if (MO.isUse()) { if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second)) MO.setIsKill(); } else { if (!MO.isDead()) hasLiveDef = true; } } DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n'); // FIXME: Use a second vreg if instruction has no tied ops. if (RI.Writes) if (hasLiveDef) insertSpill(NewVReg, true, MI); } }
/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each /// of its predecessors. bool TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB, MachineFunction &MF, SmallVector<MachineBasicBlock*, 8> &TDBBs, SmallVector<MachineInstr*, 16> &Copies) { if (!shouldTailDuplicate(MF, *TailBB)) return false; DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); // Iterate through all the unique predecessors and tail-duplicate this // block into them, if possible. Copying the list ahead of time also // avoids trouble with the predecessor list reallocating. bool Changed = false; SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(), TailBB->pred_end()); DenseSet<unsigned> UsedByPhi; getRegsUsedByPHIs(*TailBB, &UsedByPhi); for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(), PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; assert(TailBB != PredBB && "Single-block loop should have been rejected earlier!"); // EH edges are ignored by AnalyzeBranch. if (PredBB->succ_size() > 1) continue; MachineBasicBlock *PredTBB, *PredFBB; SmallVector<MachineOperand, 4> PredCond; if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)) continue; if (!PredCond.empty()) continue; // Don't duplicate into a fall-through predecessor (at least for now). if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB << "From Succ: " << *TailBB); TDBBs.push_back(PredBB); // Remove PredBB's unconditional branch. TII->RemoveBranch(*PredBB); // Clone the contents of TailBB into PredBB. DenseMap<unsigned, unsigned> LocalVRMap; SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; MachineBasicBlock::iterator I = TailBB->begin(); while (I != TailBB->end()) { MachineInstr *MI = &*I; ++I; if (MI->isPHI()) { // Replace the uses of the def of the PHI with the register coming // from PredBB. ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true); } else { // Replace def of virtual registers with new registers, and update // uses with PHI source register or the new registers. DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi); } } MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(), TII->get(TargetOpcode::COPY), CopyInfos[i].first).addReg(CopyInfos[i].second)); } // Simplify TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true); NumInstrDups += TailBB->size() - 1; // subtract one for removed branch // Update the CFG. PredBB->removeSuccessor(PredBB->succ_begin()); assert(PredBB->succ_empty() && "TailDuplicate called on block with multiple successors!"); for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(), E = TailBB->succ_end(); I != E; ++I) PredBB->addSuccessor(*I); Changed = true; ++NumTailDups; } // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this // block into the prior block. MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB)); MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0; SmallVector<MachineOperand, 4> PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. 
if (PrevBB->succ_size() == 1 && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { DEBUG(dbgs() << "\nMerging into block: " << *PrevBB << "From MBB: " << *TailBB); if (PreRegAlloc) { DenseMap<unsigned, unsigned> LocalVRMap; SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; MachineBasicBlock::iterator I = TailBB->begin(); // Process PHI instructions first. while (I != TailBB->end() && I->isPHI()) { // Replace the uses of the def of the PHI with the register coming // from PredBB. MachineInstr *MI = &*I++; ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true); if (MI->getParent()) MI->eraseFromParent(); } // Now copy the non-PHI instructions. while (I != TailBB->end()) { // Replace def of virtual registers with new registers, and update // uses with PHI source register or the new registers. MachineInstr *MI = &*I++; DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi); MI->eraseFromParent(); } MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(), TII->get(TargetOpcode::COPY), CopyInfos[i].first) .addReg(CopyInfos[i].second)); } } else { // No PHIs to worry about, just splice the instructions over. PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end()); } PrevBB->removeSuccessor(PrevBB->succ_begin()); assert(PrevBB->succ_empty()); PrevBB->transferSuccessors(TailBB); TDBBs.push_back(PrevBB); Changed = true; } // If this is after register allocation, there are no phis to fix. if (!PreRegAlloc) return Changed; // If we made no changes so far, we are safe. if (!Changed) return Changed; // Handle the nasty case in that we duplicated a block that is part of a loop // into some but not all of its predecessors. For example: // 1 -> 2 <-> 3 | // \ | // \---> rest | // if we duplicate 2 into 1 but not into 3, we end up with // 12 -> 3 <-> 2 -> rest | // \ / | // \----->-----/ | // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced // with a phi in 3 (which now dominates 2). // What we do here is introduce a copy in 3 of the register defined by the // phi, just like when we are duplicating 2 into 3, but we don't copy any // real instructions or remove the 3 -> 2 edge from the phi in 2. for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(), PE = Preds.end(); PI != PE; ++PI) { MachineBasicBlock *PredBB = *PI; if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end()) continue; // EH edges if (PredBB->succ_size() != 1) continue; DenseMap<unsigned, unsigned> LocalVRMap; SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos; MachineBasicBlock::iterator I = TailBB->begin(); // Process PHI instructions first. while (I != TailBB->end() && I->isPHI()) { // Replace the uses of the def of the PHI with the register coming // from PredBB. MachineInstr *MI = &*I++; ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false); } MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator(); for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) { Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(), TII->get(TargetOpcode::COPY), CopyInfos[i].first).addReg(CopyInfos[i].second)); } } return Changed; }
/// TailDuplicateBlocks - Look for small blocks that are unconditionally /// branched to and do not fall through. Tail-duplicate their instructions /// into their predecessors to eliminate (dynamic) branches. bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) { bool MadeChange = false; if (PreRegAlloc && TailDupVerify) { DEBUG(dbgs() << "\n*** Before tail-duplicating\n"); VerifyPHIs(MF, true); } SmallVector<MachineInstr*, 8> NewPHIs; MachineSSAUpdater SSAUpdate(MF, &NewPHIs); for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) { MachineBasicBlock *MBB = I++; if (NumTails == TailDupLimit) break; // Save the successors list. SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector<MachineBasicBlock*, 8> TDBBs; SmallVector<MachineInstr*, 16> Copies; if (TailDuplicate(MBB, MF, TDBBs, Copies)) { ++NumTails; // TailBB's immediate successors are now successors of those predecessors // which duplicated TailBB. Add the predecessors as sources to the PHI // instructions. bool isDead = MBB->pred_empty(); if (PreRegAlloc) UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs); // If it is dead, remove it. if (isDead) { NumInstrDups -= MBB->size(); RemoveDeadBlock(MBB); ++NumDeadBlocks; } // Update SSA form. if (!SSAUpdateVRs.empty()) { for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) { unsigned VReg = SSAUpdateVRs[i]; SSAUpdate.Initialize(VReg); // If the original definition is still around, add it as an available // value. MachineInstr *DefMI = MRI->getVRegDef(VReg); MachineBasicBlock *DefBB = 0; if (DefMI) { DefBB = DefMI->getParent(); SSAUpdate.AddAvailableValue(DefBB, VReg); } // Add the new vregs as available values. DenseMap<unsigned, AvailableValsTy>::iterator LI = SSAUpdateVals.find(VReg); for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { MachineBasicBlock *SrcBB = LI->second[j].first; unsigned SrcReg = LI->second[j].second; SSAUpdate.AddAvailableValue(SrcBB, SrcReg); } // Rewrite uses that are outside of the original def's block. MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg); while (UI != MRI->use_end()) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; ++UI; if (UseMI->isDebugValue()) { // SSAUpdate can replace the use with an undef. That creates // a debug instruction that is a kill. // FIXME: Should it SSAUpdate job to delete debug instructions // instead of replacing the use with undef? UseMI->eraseFromParent(); continue; } if (UseMI->getParent() == DefBB && !UseMI->isPHI()) continue; SSAUpdate.RewriteUse(UseMO); } } SSAUpdateVRs.clear(); SSAUpdateVals.clear(); } // Eliminate some of the copies inserted by tail duplication to maintain // SSA form. for (unsigned i = 0, e = Copies.size(); i != e; ++i) { MachineInstr *Copy = Copies[i]; if (!Copy->isCopy()) continue; unsigned Dst = Copy->getOperand(0).getReg(); unsigned Src = Copy->getOperand(1).getReg(); MachineRegisterInfo::use_iterator UI = MRI->use_begin(Src); if (++UI == MRI->use_end()) { // Copy is the only use. Do trivial copy propagation here. MRI->replaceRegWith(Dst, Src); Copy->eraseFromParent(); } } if (PreRegAlloc && TailDupVerify) VerifyPHIs(MF, false); MadeChange = true; } } NumAddedPHIs += NewPHIs.size(); return MadeChange; }
bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const { if (!I.hasOneMemOperand()) return false; if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS && (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) return false; if (!isInstrUniform(I)) return false; if (hasVgprParts(AddrInfo)) return false; MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); unsigned Opcode; unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI); if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) { const GEPInfo &GEPInfo = AddrInfo[0]; unsigned PtrReg = GEPInfo.SgprParts[0]; int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm); if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) { Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) .addReg(PtrReg) .addImm(EncodedImm) .addImm(0); // glc return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); } if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedImm)) { Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize); MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) .addReg(PtrReg) .addImm(EncodedImm) .addImm(0); // glc return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); } if (isUInt<32>(GEPInfo.Imm)) { Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize); unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg) .addImm(GEPInfo.Imm); MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) .addReg(PtrReg) .addReg(OffsetReg) .addImm(0); // glc return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); } } unsigned PtrReg = I.getOperand(1).getReg(); Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize); MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg) .addReg(PtrReg) .addImm(0) .addImm(0); // glc return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI); }
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineOperand &ImmOp = I.getOperand(1); // The AMDGPU backend only supports Imm operands and not CImm or FPImm. if (ImmOp.isFPImm()) { const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); ImmOp.ChangeToImmediate(Imm.getZExtValue()); } else if (ImmOp.isCImm()) { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); } unsigned DstReg = I.getOperand(0).getReg(); unsigned Size; bool IsSgpr; const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); if (RB) { IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; Size = MRI.getType(DstReg).getSizeInBits(); } else { const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); IsSgpr = TRI.isSGPRClass(RC); Size = TRI.getRegSizeInBits(*RC); } if (Size != 32 && Size != 64) return false; unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; if (Size == 32) { I.setDesc(TII.get(Opcode)); I.addImplicitDefUseOperands(*MF); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } DebugLoc DL = I.getDebugLoc(); const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; unsigned LoReg = MRI.createVirtualRegister(RC); unsigned HiReg = MRI.createVirtualRegister(RC); const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) .addImm(Imm.trunc(32).getZExtValue()); BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) .addImm(Imm.ashr(32).getZExtValue()); const MachineInstr *RS = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) .addReg(LoReg) .addImm(AMDGPU::sub0) .addReg(HiReg) .addImm(AMDGPU::sub1); // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); const TargetRegisterClass *DstRC = TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); if (!DstRC) return true; return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); }
/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads /// a single register and writes a single register and it does not modify the /// source, and if the source value is preserved as a sub-register of the /// result, then replace all reachable uses of the source with the subreg of the /// result. /// /// Do not generate an EXTRACT that is used only in a debug use, as this changes /// the code. Since this code does not currently share EXTRACTs, just ignore all /// debug uses. bool PeepholeOptimizer:: optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs) { unsigned SrcReg, DstReg, SubIdx; if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx)) return false; if (TargetRegisterInfo::isPhysicalRegister(DstReg) || TargetRegisterInfo::isPhysicalRegister(SrcReg)) return false; if (MRI->hasOneNonDBGUse(SrcReg)) // No other uses. return false; // Ensure DstReg can get a register class that actually supports // sub-registers. Don't change the class until we commit. const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx); if (!DstRC) return false; // The ext instr may be operating on a sub-register of SrcReg as well. // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit // register. // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of // SrcReg:SubIdx should be replaced. bool UseSrcSubIdx = TM->getRegisterInfo()-> getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0; // The source has other uses. See if we can replace the other uses with use of // the result of the extension. SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs; for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) ReachedBBs.insert(UI->getParent()); // Uses that are in the same BB of uses of the result of the instruction. SmallVector<MachineOperand*, 8> Uses; // Uses that the result of the instruction can reach. SmallVector<MachineOperand*, 8> ExtendedUses; bool ExtendLife = true; for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) { MachineOperand &UseMO = UI.getOperand(); MachineInstr *UseMI = &*UI; if (UseMI == MI) continue; if (UseMI->isPHI()) { ExtendLife = false; continue; } // Only accept uses of SrcReg:SubIdx. if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx) continue; // It's an error to translate this: // // %reg1025 = <sext> %reg1024 // ... // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4 // // into this: // // %reg1025 = <sext> %reg1024 // ... // %reg1027 = COPY %reg1025:4 // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4 // // The problem here is that SUBREG_TO_REG is there to assert that an // implicit zext occurs. It doesn't insert a zext instruction. If we allow // the COPY here, it will give us the value after the <sext>, not the // original value of %reg1024 before <sext>. if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) continue; MachineBasicBlock *UseMBB = UseMI->getParent(); if (UseMBB == MBB) { // Local uses that come after the extension. if (!LocalMIs.count(UseMI)) Uses.push_back(&UseMO); } else if (ReachedBBs.count(UseMBB)) { // Non-local uses where the result of the extension is used. Always // replace these unless it's a PHI. 
Uses.push_back(&UseMO); } else if (Aggressive && DT->dominates(MBB, UseMBB)) { // We may want to extend the live range of the extension result in order // to replace these uses. ExtendedUses.push_back(&UseMO); } else { // Both will be live out of the def MBB anyway. Don't extend live range of // the extension result. ExtendLife = false; break; } } if (ExtendLife && !ExtendedUses.empty()) // Extend the liveness of the extension result. std::copy(ExtendedUses.begin(), ExtendedUses.end(), std::back_inserter(Uses)); // Now replace all uses. bool Changed = false; if (!Uses.empty()) { SmallPtrSet<MachineBasicBlock*, 4> PHIBBs; // Look for PHI uses of the extended result, we don't want to extend the // liveness of a PHI input. It breaks all kinds of assumptions down // stream. A PHI use is expected to be the kill of its source values. for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end(); UI != UE; ++UI) if (UI->isPHI()) PHIBBs.insert(UI->getParent()); const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); for (unsigned i = 0, e = Uses.size(); i != e; ++i) { MachineOperand *UseMO = Uses[i]; MachineInstr *UseMI = UseMO->getParent(); MachineBasicBlock *UseMBB = UseMI->getParent(); if (PHIBBs.count(UseMBB)) continue; // About to add uses of DstReg, clear DstReg's kill flags. if (!Changed) { MRI->clearKillFlags(DstReg); MRI->constrainRegClass(DstReg, DstRC); } unsigned NewVR = MRI->createVirtualRegister(RC); MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(DstReg, 0, SubIdx); // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set. if (UseSrcSubIdx) { Copy->getOperand(0).setSubReg(SubIdx); Copy->getOperand(0).setIsUndef(); } UseMO->setReg(NewVR); ++NumReuse; Changed = true; } } return Changed; }
/// EmitSchedule - Emit the machine code in scheduled order. Return the new /// InsertPos and MachineBasicBlock that contains this insertion /// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does /// not necessarily refer to returned BB. The emitter may split blocks. MachineBasicBlock *ScheduleDAGSDNodes:: EmitSchedule(MachineBasicBlock::iterator &InsertPos) { InstrEmitter Emitter(BB, InsertPos); DenseMap<SDValue, unsigned> VRBaseMap; DenseMap<SUnit*, unsigned> CopyVRBaseMap; SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders; SmallSet<unsigned, 8> Seen; bool HasDbg = DAG->hasDebugValues(); // If this is the first BB, emit byval parameter dbg_value's. if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) { SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin(); SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd(); for (; PDI != PDE; ++PDI) { MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap); if (DbgMI) BB->insert(InsertPos, DbgMI); } } for (unsigned i = 0, e = Sequence.size(); i != e; i++) { SUnit *SU = Sequence[i]; if (!SU) { // Null SUnit* is a noop. TII->insertNoop(*Emitter.getBlock(), InsertPos); continue; } // For pre-regalloc scheduling, create instructions corresponding to the // SDNode and any glued SDNodes and append them to the block. if (!SU->getNode()) { // Emit a copy. EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos); continue; } SmallVector<SDNode *, 4> GluedNodes; for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode()) GluedNodes.push_back(N); while (!GluedNodes.empty()) { SDNode *N = GluedNodes.back(); Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned, VRBaseMap); // Remember the source order of the inserted instruction. if (HasDbg) ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen); GluedNodes.pop_back(); } Emitter.EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap); // Remember the source order of the inserted instruction. if (HasDbg) ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen); } // Insert all the dbg_values which have not already been inserted in source // order sequence. if (HasDbg) { MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI(); // Sort the source order instructions and use the order to insert debug // values. std::sort(Orders.begin(), Orders.end(), OrderSorter()); SDDbgInfo::DbgIterator DI = DAG->DbgBegin(); SDDbgInfo::DbgIterator DE = DAG->DbgEnd(); // Now emit the rest according to source order. unsigned LastOrder = 0; for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) { unsigned Order = Orders[i].first; MachineInstr *MI = Orders[i].second; // Insert all SDDbgValue's whose order(s) are before "Order". if (!MI) continue; for (; DI != DE && (*DI)->getOrder() >= LastOrder && (*DI)->getOrder() < Order; ++DI) { if ((*DI)->isInvalidated()) continue; MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap); if (DbgMI) { if (!LastOrder) // Insert to start of the BB (after PHIs). BB->insert(BBBegin, DbgMI); else { // Insert at the instruction, which may be in a different // block, if the block was split by a custom inserter. MachineBasicBlock::iterator Pos = MI; MI->getParent()->insert(llvm::next(Pos), DbgMI); } } } LastOrder = Order; } // Add trailing DbgValue's before the terminator. FIXME: May want to add // some of them before one or more conditional branches? 
SmallVector<MachineInstr*, 8> DbgMIs; while (DI != DE) { if (!(*DI)->isInvalidated()) if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap)) DbgMIs.push_back(DbgMI); ++DI; } MachineBasicBlock *InsertBB = Emitter.getBlock(); MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator(); InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end()); } InsertPos = Emitter.getInsertPos(); return Emitter.getBlock(); }
/// It's possible to determine the value of a register based on a dominating /// condition. To do so, this function checks to see if the basic block \p MBB /// is the target of a conditional branch \p CondBr with an equality comparison. /// If the branch is a CBZ/CBNZ, we know the value of its source operand is zero /// in \p MBB for some cases. Otherwise, we find and inspect the NZCV setting /// instruction (e.g., SUBS, ADDS). If this instruction defines a register /// other than WZR/XZR, we know the value of the destination register is zero in /// \p MMB for some cases. In addition, if the NZCV setting instruction is /// comparing against a constant we know the other source register is equal to /// the constant in \p MBB for some cases. If we find any constant values, push /// a physical register and constant value pair onto the KnownRegs vector and /// return true. Otherwise, return false if no known values were found. bool AArch64RedundantCopyElimination::knownRegValInBlock( MachineInstr &CondBr, MachineBasicBlock *MBB, SmallVectorImpl<RegImm> &KnownRegs, MachineBasicBlock::iterator &FirstUse) { unsigned Opc = CondBr.getOpcode(); // Check if the current basic block is the target block to which the // CBZ/CBNZ instruction jumps when its Wt/Xt is zero. if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && MBB == CondBr.getOperand(1).getMBB()) || ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && MBB != CondBr.getOperand(1).getMBB())) { FirstUse = CondBr; KnownRegs.push_back(RegImm(CondBr.getOperand(0).getReg(), 0)); return true; } // Otherwise, must be a conditional branch. if (Opc != AArch64::Bcc) return false; // Must be an equality check (i.e., == or !=). AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return false; MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB(); if ((CC == AArch64CC::EQ && BrTarget != MBB) || (CC == AArch64CC::NE && BrTarget == MBB)) return false; // Stop if we get to the beginning of PredMBB. MachineBasicBlock *PredMBB = *MBB->pred_begin(); assert(PredMBB == CondBr.getParent() && "Conditional branch not in predecessor block!"); if (CondBr == PredMBB->begin()) return false; // Registers clobbered in PredMBB between CondBr instruction and current // instruction being checked in loop. DomBBClobberedRegs.reset(); // Find compare instruction that sets NZCV used by CondBr. MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator(); for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) { bool IsCMN = false; switch (PredI.getOpcode()) { default: break; // CMN is an alias for ADDS with a dead destination register. case AArch64::ADDSWri: case AArch64::ADDSXri: IsCMN = true; LLVM_FALLTHROUGH; // CMP is an alias for SUBS with a dead destination register. case AArch64::SUBSWri: case AArch64::SUBSXri: { MCPhysReg DstReg = PredI.getOperand(0).getReg(); MCPhysReg SrcReg = PredI.getOperand(1).getReg(); bool Res = false; // If we're comparing against a non-symbolic immediate and the source // register of the compare is not modified (including a self-clobbering // compare) between the compare and conditional branch we known the value // of the 1st source operand. if (PredI.getOperand(2).isImm() && !DomBBClobberedRegs[SrcReg] && SrcReg != DstReg) { // We've found the instruction that sets NZCV. 
int32_t KnownImm = PredI.getOperand(2).getImm(); int32_t Shift = PredI.getOperand(3).getImm(); KnownImm <<= Shift; if (IsCMN) KnownImm = -KnownImm; FirstUse = PredI; KnownRegs.push_back(RegImm(SrcReg, KnownImm)); Res = true; } // If this instruction defines something other than WZR/XZR, we know its // result is zero in some cases. if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return Res; // The destination register must not be modified between the NZCV setting // instruction and the conditional branch. if (DomBBClobberedRegs[DstReg]) return Res; FirstUse = PredI; KnownRegs.push_back(RegImm(DstReg, 0)); return true; } // Look for NZCV setting instructions that define something other than // WZR/XZR. case AArch64::ADCSWr: case AArch64::ADCSXr: case AArch64::ADDSWrr: case AArch64::ADDSWrs: case AArch64::ADDSWrx: case AArch64::ADDSXrr: case AArch64::ADDSXrs: case AArch64::ADDSXrx: case AArch64::ADDSXrx64: case AArch64::ANDSWri: case AArch64::ANDSWrr: case AArch64::ANDSWrs: case AArch64::ANDSXri: case AArch64::ANDSXrr: case AArch64::ANDSXrs: case AArch64::BICSWrr: case AArch64::BICSWrs: case AArch64::BICSXrs: case AArch64::BICSXrr: case AArch64::SBCSWr: case AArch64::SBCSXr: case AArch64::SUBSWrr: case AArch64::SUBSWrs: case AArch64::SUBSWrx: case AArch64::SUBSXrr: case AArch64::SUBSXrs: case AArch64::SUBSXrx: case AArch64::SUBSXrx64: { MCPhysReg DstReg = PredI.getOperand(0).getReg(); if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) return false; // The destination register of the NZCV setting instruction must not be // modified before the conditional branch. if (DomBBClobberedRegs[DstReg]) return false; // We've found the instruction that sets NZCV whose DstReg == 0. FirstUse = PredI; KnownRegs.push_back(RegImm(DstReg, 0)); return true; } } // Bail if we see an instruction that defines NZCV that we don't handle. if (PredI.definesRegister(AArch64::NZCV)) return false; // Track clobbered registers. trackRegDefs(PredI, DomBBClobberedRegs, TRI); } return false; }
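// A small standalone sketch of how the known immediate is derived above for a
// CMP/CMN (SUBS/ADDS against an immediate): the immediate operand may carry a
// left shift, and for CMN the comparison value is the negated immediate. The
// function and register names in the comments are illustrative, not LLVM API.
#include <cassert>
#include <cstdint>

// For "cmp wN, #imm, lsl #shift" taken on the EQ edge, wN is known to equal
// imm << shift; for cmn (ADDS with a dead destination) it equals -(imm << shift).
static int32_t knownValueFromCompare(int32_t Imm, int32_t Shift, bool IsCMN) {
  int32_t Known = Imm << Shift;
  return IsCMN ? -Known : Known;
}

int main() {
  assert(knownValueFromCompare(4, 0, /*IsCMN=*/false) == 4);     // cmp w0, #4
  assert(knownValueFromCompare(1, 12, /*IsCMN=*/false) == 4096); // cmp w0, #1, lsl #12
  assert(knownValueFromCompare(4, 0, /*IsCMN=*/true) == -4);     // cmn w0, #4
  return 0;
}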
void SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const { // We need to mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer // this. SmallVector<MachineInstr *, 4> CopiesToReplace; SmallVector<FoldCandidate, 4> FoldList; MachineOperand &Dst = MI.getOperand(0); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); if (FoldingImm) { unsigned NumLiteralUses = 0; MachineOperand *NonInlineUse = nullptr; int NonInlineUseOpNo = -1; MachineRegisterInfo::use_iterator NextUse, NextInstUse; for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); Use != E; Use = NextUse) { NextUse = std::next(Use); MachineInstr *UseMI = Use->getParent(); unsigned OpNo = Use.getOperandNo(); // Folding the immediate may reveal operations that can be constant // folded or replaced with a copy. This can happen for example after // frame indices are lowered to constants or from splitting 64-bit // constants. // // We may also encounter cases where one or both operands are // immediates materialized into a register, which would ordinarily not // be folded due to multiple uses or operand constraints. if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); // Some constant folding cases change the same immediate's use to a new // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); continue; } // Try to fold any inline immediate uses, and then only fold other // constants if they have one use. // // The legality of the inline immediate must be checked based on the use // operand, not the defining instruction, because 32-bit instructions // with 32-bit inline immediate sources may be used to materialize // constants used in 16-bit operands. // // e.g. it is unsafe to fold: // s_mov_b32 s0, 1.0 // materializes 0x3f800000 // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 // Folding immediates with more than one use will increase program size. // FIXME: This will also reduce register usage, which may be better // in some cases. A better heuristic is needed. if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); } else { if (++NumLiteralUses == 1) { NonInlineUse = &*Use; NonInlineUseOpNo = OpNo; } } } if (NumLiteralUses == 1) { MachineInstr *UseMI = NonInlineUse->getParent(); foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); } } else { // Folding register. for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, CopiesToReplace); } } MachineFunction *MF = MI.getParent()->getParent(); // Make sure we add EXEC uses to any new v_mov instructions created. for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, *TRI)) { // Clear kill flags. if (Fold.isReg()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); // FIXME: Probably shouldn't bother trying to fold if not an // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR // copies.
MRI->clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); tryFoldInst(TII, Fold.UseMI); } else if (Fold.isCommuted()) { // Restore the instruction's original operand order if the fold has failed. TII->commuteInstruction(*Fold.UseMI, false); } } }
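// A minimal sketch of the deferral pattern foldInstOperand relies on: the use
// scan above only records work (FoldList / CopiesToReplace) and the actual
// mutation happens after the walk, so the use_iterator is never invalidated
// mid-iteration. The Use/Candidate types below are hypothetical stand-ins for
// illustration only.
#include <iostream>
#include <string>
#include <vector>

struct Use { std::string User; int OpNo; };
struct Candidate { std::string User; int OpNo; int Imm; };

static void foldImmediateUses(const std::vector<Use> &Uses, int Imm) {
  std::vector<Candidate> FoldList;
  // Phase 1: scan the uses and decide what to fold, without mutating anything.
  for (const Use &U : Uses)
    FoldList.push_back({U.User, U.OpNo, Imm});
  // Phase 2: apply the recorded folds once the scan is complete.
  for (const Candidate &C : FoldList)
    std::cout << "fold #" << C.Imm << " into " << C.User
              << " operand " << C.OpNo << "\n";
}

int main() {
  foldImmediateUses({{"v_add_u32", 1}, {"v_mul_lo_u32", 2}}, 42);
  return 0;
}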
void ScheduleDAGInstrs::BuildSchedGraph(AliasAnalysis *AA) { // We'll be allocating one SUnit for each instruction, plus one for // the region exit node. SUnits.reserve(BB->size()); // We build scheduling units by walking a block's instruction list from bottom // to top. // Remember where a generic side-effecting instruction is as we proceed. SUnit *BarrierChain = 0, *AliasChain = 0; // Memory references to specific known memory locations are tracked // so that they can be given more precise dependencies. We track // separately the known memory locations that may alias and those // that are known not to alias. std::map<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs; std::map<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses; // Keep track of dangling debug references to registers. std::vector<std::pair<MachineInstr*, unsigned> > DanglingDebugValue(TRI->getNumRegs(), std::make_pair(static_cast<MachineInstr*>(0), 0)); // Check to see if the scheduler cares about latencies. bool UnitLatencies = ForceUnitLatencies(); // Ask the target if address-backscheduling is desirable, and if so how much. const TargetSubtarget &ST = TM.getSubtarget<TargetSubtarget>(); unsigned SpecialAddressLatency = ST.getSpecialAddressLatency(); // Remove any stale debug info; sometimes BuildSchedGraph is called again // without emitting the info from the previous call. DbgValueVec.clear(); // Model data dependencies between instructions being scheduled and the // ExitSU. AddSchedBarrierDeps(); // Walk the list of instructions, from bottom moving up. for (MachineBasicBlock::iterator MII = InsertPos, MIE = Begin; MII != MIE; --MII) { MachineInstr *MI = prior(MII); // DBG_VALUEs do not have SUnits built, so just remember these for later // reinsertion. if (MI->isDebugValue()) { if (MI->getNumOperands()==3 && MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) DanglingDebugValue[MI->getOperand(0).getReg()] = std::make_pair(MI, DbgValueVec.size()); DbgValueVec.push_back(MI); continue; } const TargetInstrDesc &TID = MI->getDesc(); assert(!TID.isTerminator() && !MI->isLabel() && "Cannot schedule terminators or labels!"); // Create the SUnit for this MI. SUnit *SU = NewSUnit(MI); SU->isCall = TID.isCall(); SU->isCommutable = TID.isCommutable(); // Assign the Latency field of SU using target-provided information. if (UnitLatencies) SU->Latency = 1; else ComputeLatency(SU); // Add register-based dependencies (data, anti, and output). for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) { const MachineOperand &MO = MI->getOperand(j); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg == 0) continue; assert(TRI->isPhysicalRegister(Reg) && "Virtual register encountered!"); if (MO.isDef() && DanglingDebugValue[Reg].first!=0) { SU->DbgInstrList.push_back(DanglingDebugValue[Reg].first); DbgValueVec[DanglingDebugValue[Reg].second] = 0; DanglingDebugValue[Reg] = std::make_pair((MachineInstr*)0, 0); } std::vector<SUnit *> &UseList = Uses[Reg]; std::vector<SUnit *> &DefList = Defs[Reg]; // Optionally add output and anti dependencies. For anti // dependencies we use a latency of 0 because for a multi-issue // target we want to allow the defining instruction to issue // in the same cycle as the using instruction. // TODO: Using a latency of 1 here for output dependencies assumes // there's no cost for reusing registers. SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output; unsigned AOLatency = (Kind == SDep::Anti) ?
0 : 1; for (unsigned i = 0, e = DefList.size(); i != e; ++i) { SUnit *DefSU = DefList[i]; if (DefSU == &ExitSU) continue; if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() || !DefSU->getInstr()->registerDefIsDead(Reg))) DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/Reg)); } for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { std::vector<SUnit *> &DefList = Defs[*Alias]; for (unsigned i = 0, e = DefList.size(); i != e; ++i) { SUnit *DefSU = DefList[i]; if (DefSU == &ExitSU) continue; if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() || !DefSU->getInstr()->registerDefIsDead(*Alias))) DefSU->addPred(SDep(SU, Kind, AOLatency, /*Reg=*/ *Alias)); } } if (MO.isDef()) { // Add any data dependencies. unsigned DataLatency = SU->Latency; for (unsigned i = 0, e = UseList.size(); i != e; ++i) { SUnit *UseSU = UseList[i]; if (UseSU == SU) continue; unsigned LDataLatency = DataLatency; // Optionally add in a special extra latency for nodes that // feed addresses. // TODO: Do this for register aliases too. // TODO: Perhaps we should get rid of // SpecialAddressLatency and just move this into // adjustSchedDependency for the targets that care about it. if (SpecialAddressLatency != 0 && !UnitLatencies && UseSU != &ExitSU) { MachineInstr *UseMI = UseSU->getInstr(); const TargetInstrDesc &UseTID = UseMI->getDesc(); int RegUseIndex = UseMI->findRegisterUseOperandIdx(Reg); assert(RegUseIndex >= 0 && "UseMI doesn't use register!"); if (RegUseIndex >= 0 && (UseTID.mayLoad() || UseTID.mayStore()) && (unsigned)RegUseIndex < UseTID.getNumOperands() && UseTID.OpInfo[RegUseIndex].isLookupPtrRegClass()) LDataLatency += SpecialAddressLatency; } // Adjust the dependence latency using operand def/use // information (if any), and then allow the target to // perform its own adjustments. const SDep& dep = SDep(SU, SDep::Data, LDataLatency, Reg); if (!UnitLatencies) { ComputeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep)); } UseSU->addPred(dep); } for (const unsigned *Alias = TRI->getAliasSet(Reg); *Alias; ++Alias) { std::vector<SUnit *> &UseList = Uses[*Alias]; for (unsigned i = 0, e = UseList.size(); i != e; ++i) { SUnit *UseSU = UseList[i]; if (UseSU == SU) continue; const SDep& dep = SDep(SU, SDep::Data, DataLatency, *Alias); if (!UnitLatencies) { ComputeOperandLatency(SU, UseSU, const_cast<SDep &>(dep)); ST.adjustSchedDependency(SU, UseSU, const_cast<SDep &>(dep)); } UseSU->addPred(dep); } } // If a def is going to wrap back around to the top of the loop, // backschedule it. if (!UnitLatencies && DefList.empty()) { LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(Reg); if (I != LoopRegs.Deps.end()) { const MachineOperand *UseMO = I->second.first; unsigned Count = I->second.second; const MachineInstr *UseMI = UseMO->getParent(); unsigned UseMOIdx = UseMO - &UseMI->getOperand(0); const TargetInstrDesc &UseTID = UseMI->getDesc(); // TODO: If we knew the total depth of the region here, we could // handle the case where the whole loop is inside the region but // is large enough that the isScheduleHigh trick isn't needed. if (UseMOIdx < UseTID.getNumOperands()) { // Currently, we only support scheduling regions consisting of // single basic blocks. Check to see if the instruction is in // the same region by checking to see if it has the same parent.
if (UseMI->getParent() != MI->getParent()) { unsigned Latency = SU->Latency; if (UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) Latency += SpecialAddressLatency; // This is a wild guess as to the portion of the latency which // will be overlapped by work done outside the current // scheduling region. Latency -= std::min(Latency, Count); // Add the artificial edge. ExitSU.addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0, /*isNormalMemory=*/false, /*isMustAlias=*/false, /*isArtificial=*/true)); } else if (SpecialAddressLatency > 0 && UseTID.OpInfo[UseMOIdx].isLookupPtrRegClass()) { // The entire loop body is within the current scheduling region // and the latency of this operation is assumed to be greater // than the latency of the loop. // TODO: Recursively mark data-edge predecessors as // isScheduleHigh too. SU->isScheduleHigh = true; } } LoopRegs.Deps.erase(I); } } UseList.clear(); if (!MO.isDead()) DefList.clear(); DefList.push_back(SU); } else { UseList.push_back(SU); } } // Add chain dependencies. // Chain dependencies used to enforce memory order should have // latency of 0 (except for true dependency of Store followed by // aliased Load... we estimate that with a single cycle of latency // assuming the hardware will bypass) // Note that isStoreToStackSlot and isLoadFromStackSlot are not usable // after stack slots are lowered to actual addresses. // TODO: Use an AliasAnalysis and do real alias-analysis queries, and // produce more precise dependence information. #define STORE_LOAD_LATENCY 1 unsigned TrueMemOrderLatency = 0; if (TID.isCall() || MI->hasUnmodeledSideEffects() || (MI->hasVolatileMemoryRef() && (!TID.mayLoad() || !MI->isInvariantLoad(AA)))) { // Be conservative with these and add dependencies on all memory // references, even those that are known to not alias. for (std::map<const Value *, SUnit *>::iterator I = NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) { I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); } for (std::map<const Value *, std::vector<SUnit *> >::iterator I = NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency)); } NonAliasMemDefs.clear(); NonAliasMemUses.clear(); // Add SU to the barrier chain. if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); BarrierChain = SU; // fall-through new_alias_chain: // Chain all possibly aliasing memory references through SU. if (AliasChain) AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); AliasChain = SU; for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency)); for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) { I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); } for (std::map<const Value *, std::vector<SUnit *> >::iterator I = AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) { for (unsigned i = 0, e = I->second.size(); i != e; ++i) I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency)); } PendingLoads.clear(); AliasMemDefs.clear(); AliasMemUses.clear(); } else if (TID.mayStore()) { bool MayAlias = true; TrueMemOrderLatency = STORE_LOAD_LATENCY; if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) { // A store to a specific PseudoSourceValue. Add precise dependencies. // Record the def in MemDefs, first adding a dep if there is // an existing def.
std::map<const Value *, SUnit *>::iterator I = ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V)); std::map<const Value *, SUnit *>::iterator IE = ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) { I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0, /*isNormalMemory=*/true)); I->second = SU; } else { if (MayAlias) AliasMemDefs[V] = SU; else NonAliasMemDefs[V] = SU; } // Handle the uses in MemUses, if there are any. std::map<const Value *, std::vector<SUnit *> >::iterator J = ((MayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V)); std::map<const Value *, std::vector<SUnit *> >::iterator JE = ((MayAlias) ? AliasMemUses.end() : NonAliasMemUses.end()); if (J != JE) { for (unsigned i = 0, e = J->second.size(); i != e; ++i) J->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency, /*Reg=*/0, /*isNormalMemory=*/true)); J->second.clear(); } if (MayAlias) { // Add dependencies from all the PendingLoads, i.e. loads // with no underlying object. for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k) PendingLoads[k]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency)); // Add dependence on alias chain, if needed. if (AliasChain) AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); } // Add dependence on barrier chain, if needed. if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); } else { // Treat all other stores conservatively. goto new_alias_chain; } if (!ExitSU.isPred(SU)) // Push stores up a bit to avoid them getting in between cmp // and branches. ExitSU.addPred(SDep(SU, SDep::Order, 0, /*Reg=*/0, /*isNormalMemory=*/false, /*isMustAlias=*/false, /*isArtificial=*/true)); } else if (TID.mayLoad()) { bool MayAlias = true; TrueMemOrderLatency = 0; if (MI->isInvariantLoad(AA)) { // Invariant load, no chain dependencies needed! } else { if (const Value *V = getUnderlyingObjectForInstr(MI, MFI, MayAlias)) { // A load from a specific PseudoSourceValue. Add precise dependencies. std::map<const Value *, SUnit *>::iterator I = ((MayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V)); std::map<const Value *, SUnit *>::iterator IE = ((MayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end()); if (I != IE) I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0, /*Reg=*/0, /*isNormalMemory=*/true)); if (MayAlias) AliasMemUses[V].push_back(SU); else NonAliasMemUses[V].push_back(SU); } else { // A load with no underlying object. Depend on all // potentially aliasing stores. for (std::map<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I) I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); PendingLoads.push_back(SU); MayAlias = true; } // Add dependencies on alias and barrier chains, if needed. if (MayAlias && AliasChain) AliasChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); if (BarrierChain) BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0)); } } } for (int i = 0, e = TRI->getNumRegs(); i != e; ++i) { Defs[i].clear(); Uses[i].clear(); } PendingLoads.clear(); }
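// A simplified standalone model of the memory-chain bookkeeping above: each
// known underlying object tracks its last store (MemDefs) and the loads seen
// since that store (MemUses); a new store orders itself after both and becomes
// the new def, while a load orders itself after the last store only. This
// sketch walks in program order and adds earlier->later edges, whereas
// BuildSchedGraph walks bottom-up and uses SUnits/SDep; the types and the
// edge() callback below are illustrative stand-ins only.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct MemChains {
  std::map<std::string, int> MemDefs;               // object -> last store
  std::map<std::string, std::vector<int>> MemUses;  // object -> loads since it
  void edge(int From, int To) {
    std::cout << "SU" << From << " -> SU" << To << "\n";
  }
  void visitStore(int SU, const std::string &Obj) {
    if (MemDefs.count(Obj)) edge(MemDefs[Obj], SU); // output ordering
    for (int Load : MemUses[Obj]) edge(Load, SU);   // anti ordering
    MemUses[Obj].clear();
    MemDefs[Obj] = SU;
  }
  void visitLoad(int SU, const std::string &Obj) {
    if (MemDefs.count(Obj)) edge(MemDefs[Obj], SU); // store -> load ordering
    MemUses[Obj].push_back(SU);
  }
};

int main() {
  MemChains C;
  C.visitStore(1, "a"); // store to a: no edges yet
  C.visitLoad(2, "a");  // load from a: SU1 -> SU2
  C.visitStore(3, "a"); // store to a: SU1 -> SU3 and SU2 -> SU3
  return 0;
}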
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector<MachineInstr *, 128> Worklist; Worklist.push_back(&TopInst); while (!Worklist.empty()) { MachineInstr *Inst = Worklist.pop_back_val(); unsigned NewOpcode = getVALUOp(*Inst); if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) continue; MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo(); // Use the new VALU Opcode. const MCInstrDesc &NewDesc = get(NewOpcode); Inst->setDesc(NewDesc); // Remove any references to SCC. Vector instructions can't read from it, and // we're just about to add the implicit use / defs of VCC, and we don't want // both. for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { MachineOperand &Op = Inst->getOperand(i); if (Op.isReg() && Op.getReg() == AMDGPU::SCC) Inst->RemoveOperand(i); } // Add the implicit register uses and definitions. if (NewDesc.ImplicitUses) { for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { unsigned Reg = NewDesc.ImplicitUses[i]; Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); } } if (NewDesc.ImplicitDefs) { for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { unsigned Reg = NewDesc.ImplicitDefs[i]; Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); } } legalizeOperands(Inst); // Update the destination register class. const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); switch (Inst->getOpcode()) { // For target instructions, getOpRegClass just returns the virtual // register class associated with the operand, so we need to find an // equivalent VGPR register class in order to move the instruction to the // VALU. case AMDGPU::COPY: case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: if (RI.hasVGPRs(NewDstRC)) continue; NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); if (!NewDstRC) continue; break; default: break; } unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); I != E; ++I) { MachineInstr &UseMI = *I; if (!canReadVGPR(UseMI, I.getOperandNo())) { Worklist.push_back(&UseMI); } } } }
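// A standalone sketch of the worklist-driven propagation used by moveToVALU
// above: converting one instruction to the VALU changes its result into a
// VGPR, so any user that cannot read a VGPR operand must be converted as well,
// and the process repeats until a fixed point. The Node type and CanReadVGPR
// flag below are simplified stand-ins, not the LLVM machinery.
#include <iostream>
#include <string>
#include <vector>

struct Node {
  std::string Name;
  bool CanReadVGPR;          // true if this user can consume a VGPR operand
  bool OnVALU;               // set once the node has been converted
  std::vector<Node *> Users; // instructions that use this node's result
};

static void moveToVALU(Node &Top) {
  std::vector<Node *> Worklist = {&Top};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (N->OnVALU)
      continue;
    N->OnVALU = true;
    std::cout << "moved " << N->Name << " to VALU\n";
    // Any user that cannot read a VGPR source must itself be converted.
    for (Node *U : N->Users)
      if (!U->CanReadVGPR)
        Worklist.push_back(U);
  }
}

int main() {
  Node C{"s_add_i32", false, false, {}};
  Node B{"s_and_b32", false, false, {&C}};
  Node A{"s_lshl_b32", false, false, {&B}};
  Node D{"v_mul_f32", true, false, {}};
  A.Users.push_back(&D);
  moveToVALU(A); // converts A, then B, then C; D already reads VGPRs
  return 0;
}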
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. // // SGPRx = ... // SGPRy = REG_SEQUENCE SGPRx, sub0 ... // VGPRz = COPY SGPRy // // ==> // // VGPRx = COPY SGPRx // VGPRz = REG_SEQUENCE VGPRx, sub0 // // This exposes immediate folding opportunities when materializing 64-bit // immediates. static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, const SIRegisterInfo *TRI, const SIInstrInfo *TII, MachineRegisterInfo &MRI) { assert(MI.isRegSequence()); unsigned DstReg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) return false; if (!MRI.hasOneUse(DstReg)) return false; MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); if (!CopyUse.isCopy()) return false; const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) return false; // TODO: Could have multiple extracts? unsigned SubReg = CopyUse.getOperand(1).getSubReg(); if (SubReg != AMDGPU::NoSubRegister) return false; MRI.setRegClass(DstReg, DstRC); // SGPRx = ... // SGPRy = REG_SEQUENCE SGPRx, sub0 ... // VGPRz = COPY SGPRy // => // VGPRx = COPY SGPRx // VGPRz = REG_SEQUENCE VGPRx, sub0 MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { unsigned SrcReg = MI.getOperand(I).getReg(); unsigned SrcSubReg = MI.getOperand(I).getSubReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); assert(TRI->isSGPRClass(SrcRC) && "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) .addOperand(MI.getOperand(I)); MI.getOperand(I).setReg(TmpReg); } CopyUse.eraseFromParent(); return true; }
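// A standalone sketch of the rewrite performed above, modeled on plain data
// structures instead of MachineInstrs: a single SGPR->VGPR COPY of a
// REG_SEQUENCE result is removed by retargeting the REG_SEQUENCE at the VGPR
// destination and giving each input its own small copy. The RegSeq type and
// the register names are illustrative only.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct RegSeq {
  std::string Dst;                                        // result register
  std::vector<std::pair<std::string, std::string>> Srcs;  // (input, subreg)
};

// Rewrite "VDst = COPY Seq.Dst" into per-input copies feeding Seq directly.
static void distributeCopy(RegSeq &Seq, const std::string &VDst) {
  Seq.Dst = VDst; // the REG_SEQUENCE now defines the VGPR directly
  for (auto &Src : Seq.Srcs) {
    std::string Tmp = "v_tmp_" + Src.first;
    std::cout << Tmp << " = COPY " << Src.first << "\n";
    Src.first = Tmp; // the REG_SEQUENCE now reads the copied VGPR input
  }
  std::cout << Seq.Dst << " = REG_SEQUENCE";
  for (const auto &Src : Seq.Srcs)
    std::cout << " " << Src.first << ", " << Src.second;
  std::cout << "\n";
}

int main() {
  // Before: sgpr_y = REG_SEQUENCE sgpr_x0, sub0, sgpr_x1, sub1
  //         vgpr_z = COPY sgpr_y
  RegSeq Seq{"sgpr_y", {{"sgpr_x0", "sub0"}, {"sgpr_x1", "sub1"}}};
  distributeCopy(Seq, "vgpr_z");
  return 0;
}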