// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  unsigned I, E = MI->getDesc().getNumOperands();

  for (I = 0; I != E; ++I)
    insertDefUse(MI->getOperand(I), RegDefs, RegUses);

  // If MI is a call, add LR (the return-address register) to RegDefs to
  // prevent users of LR from going into the delay slot.
  if (MI->isCall()) {
    RegDefs.insert(CoffeeCL::LR);
    return;
  }

  // Return if MI is a return.
  if (MI->isReturn())
    return;

  // Examine the implicit operands. Exclude register AT which is in the list of
  // clobbered registers of branch instructions.
  E = MI->getNumOperands();
  for (; I != E; ++I)
    insertDefUse(MI->getOperand(I), RegDefs, RegUses);
}

// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  // If MI is a call or return, just examine the explicit non-variadic
  // operands.
  MCInstrDesc MCID = MI->getDesc();
  unsigned e = MI->isCall() || MI->isReturn() ? MCID.getNumOperands()
                                              : MI->getNumOperands();

  // Add RA to RegDefs to prevent users of RA from going into delay slot.
  if (MI->isCall())
    RegDefs.insert(Mips::RA);

  for (unsigned i = 0; i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    unsigned Reg;

    if (!MO.isReg() || !(Reg = MO.getReg()))
      continue;

    if (MO.isDef())
      RegDefs.insert(Reg);
    else if (MO.isUse())
      RegUses.insert(Reg);
  }
}

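// For context, a minimal sketch of the backward scan that typically consumes
// these def/use sets when hunting for a delay-slot candidate. It is modeled
// on the upstream Sparc/Mips fillers rather than taken from either verbatim;
// findDelayInstr and the exact set of terminating checks are illustrative.
bool Filler::findDelayInstr(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Slot,
                            MachineBasicBlock::iterator &Candidate) {
  SmallSet<unsigned, 32> RegDefs, RegUses;
  bool SawLoad = false, SawStore = false;

  // Seed the sets with the defs/uses of the instruction owning the slot.
  insertDefsUses(Slot, RegDefs, RegUses);

  MachineBasicBlock::iterator I = Slot;
  while (I != MBB.begin()) {
    --I;
    if (I->isDebugValue())
      continue;

    // Never hoist these into a delay slot.
    if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() ||
        I->hasDelaySlot())
      break;

    if (delayHasHazard(I, SawLoad, SawStore, RegDefs, RegUses)) {
      // Not safe here; fold its defs/uses in and keep scanning upwards.
      insertDefsUses(I, RegDefs, RegUses);
      continue;
    }

    Candidate = I;
    return true;
  }
  return false;
}
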
int PatmosInstrInfo::findPrevDelaySlotEnd(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &II,
                                          int Cycles) const {
  int cnt = 0;
  int maxDelaySlotSize = PST.getCFLDelaySlotCycles(false);

  while (II != MBB.begin()) {
    --II;
    if (isPseudo(&*II))
      continue;

    // This code assumes that delay slots cannot be completely inside other
    // delay slots, i.e., we only need to scan up to the first CFL instruction.
    if (II->isInlineAsm())
      break;

    if (II->isBranch() || II->isCall() || II->isReturn()) {
      cnt -= PST.getDelaySlotCycles(&*II);
      break;
    }

    if (Cycles >= 0 && cnt >= Cycles + maxDelaySlotSize)
      break;

    cnt++;
  }
  return Cycles >= 0 && cnt > Cycles ? Cycles : cnt;
}

/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
  const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                           Attribute::OptimizeForSize) ||
      FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                           Attribute::MinSize)) {
    return false;
  }

  TM = &MF.getTarget();
  if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
    return false;

  TII = TM->getInstrInfo();

  // Search through basic blocks and mark the ones that have early returns.
  ReturnBBs.clear();
  VisitedBBs.clear();
  findReturns(MF.begin());

  bool MadeChange = false;

  MachineBasicBlock *MBB;
  unsigned int Cycles = 0;

  // Pad the identified basic blocks with NOOPs.
  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
       I != ReturnBBs.end(); ++I) {
    MBB = I->first;
    Cycles = I->second;

    if (Cycles < Threshold) {
      // BB ends in a return. Skip over any DBG_VALUE instructions
      // trailing the terminator.
      assert(MBB->size() > 0 &&
             "Basic block should contain at least a RET but is empty");
      MachineBasicBlock::iterator ReturnLoc = --MBB->end();

      while (ReturnLoc->isDebugValue())
        --ReturnLoc;
      assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
             "Basic block does not end with RET");

      addPadding(MBB, ReturnLoc, Threshold - Cycles);
      NumBBsPadded++;
      MadeChange = true;
    }
  }

  return MadeChange;
}

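// A plausible sketch of the addPadding helper invoked above, assuming it
// simply emits NOOPs in front of the return until the missing cycles are
// covered; the upstream X86 pass may map cycles to NOOPs differently.
void PadShortFunc::addPadding(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned int NOOPsToAdd) {
  DebugLoc DL = MBBI->getDebugLoc();

  // Insert one NOOP per missing cycle immediately before the return.
  while (NOOPsToAdd-- > 0)
    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
}
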
bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
                            bool &sawLoad, bool &sawStore,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  if (candidate->isImplicitDef() || candidate->isKill())
    return true;

  // Loads or stores cannot be moved past a store to the delay slot
  // and stores cannot be moved past a load.
  if (candidate->mayLoad()) {
    if (sawStore)
      return true;
    sawLoad = true;
  }

  if (candidate->mayStore()) {
    if (sawStore)
      return true;
    sawStore = true;
    if (sawLoad)
      return true;
  }

  assert((!candidate->isCall() && !candidate->isReturn()) &&
         "Cannot put calls or returns in delay slot.");

  for (unsigned i = 0, e = candidate->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = candidate->getOperand(i);
    unsigned Reg;

    if (!MO.isReg() || !(Reg = MO.getReg()))
      continue; // skip

    if (MO.isDef()) {
      // check whether Reg is defined or used before delay slot.
      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
        return true;
    }
    if (MO.isUse()) {
      // check whether Reg is defined before delay slot.
      if (IsRegInSet(RegDefs, Reg))
        return true;
    }
  }
  return false;
}

void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  auto *ZII =
      static_cast<const SystemZInstrInfo*>(MF.getTarget().getInstrInfo());
  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();

  // Skip the return instruction.
  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");

  uint64_t StackSize = getAllocatedStackSize(MF);
  if (ZFI->getLowSavedGPR()) {
    --MBBI;
    unsigned Opcode = MBBI->getOpcode();
    if (Opcode != SystemZ::LMG)
      llvm_unreachable("Expected to see callee-save register restore code");

    unsigned AddrOpNo = 2;
    DebugLoc DL = MBBI->getDebugLoc();
    uint64_t Offset = StackSize + MBBI->getOperand(AddrOpNo + 1).getImm();
    unsigned NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);

    // If the offset is too large, use the largest stack-aligned offset
    // and add the rest to the base register (the stack or frame pointer).
    if (!NewOpcode) {
      uint64_t NumBytes = Offset - 0x7fff8;
      emitIncrement(MBB, MBBI, DL, MBBI->getOperand(AddrOpNo).getReg(),
                    NumBytes, ZII);
      Offset -= NumBytes;
      NewOpcode = ZII->getOpcodeForOffset(Opcode, Offset);
      assert(NewOpcode && "No restore instruction available");
    }

    MBBI->setDesc(ZII->get(NewOpcode));
    MBBI->getOperand(AddrOpNo + 1).ChangeToImmediate(Offset);
  } else if (StackSize) {
    DebugLoc DL = MBBI->getDebugLoc();
    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, StackSize, ZII);
  }
}

/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
bool TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
                                            bool IsSimple,
                                            MachineBasicBlock &TailBB) {
  // Only duplicate blocks that end with unconditional branches.
  if (TailBB.canFallThrough())
    return false;

  // Don't try to tail-duplicate single-block loops.
  if (TailBB.isSuccessor(&TailBB))
    return false;

  // Set the limit on the cost to duplicate. When optimizing for size,
  // duplicate only one, because one branch instruction can be eliminated to
  // compensate for the duplication.
  unsigned MaxDuplicateCount;
  if (TailDuplicateSize.getNumOccurrences() == 0 &&
      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
    MaxDuplicateCount = 1;
  else
    MaxDuplicateCount = TailDuplicateSize;

  // If the target has hardware branch prediction that can handle indirect
  // branches, duplicating them can often make them predictable when there
  // are common paths through the code. The limit needs to be high enough
  // to allow undoing the effects of tail merging and other optimizations
  // that rearrange the predecessors of the indirect branch.
  bool HasIndirectbr = false;
  if (!TailBB.empty())
    HasIndirectbr = TailBB.back().isIndirectBranch();

  if (HasIndirectbr && PreRegAlloc)
    MaxDuplicateCount = 20;

  // Check the instructions in the block to determine whether tail-duplication
  // is invalid or unlikely to be profitable.
  unsigned InstrCount = 0;
  for (MachineBasicBlock::iterator I = TailBB.begin(); I != TailBB.end(); ++I) {
    // Non-duplicable things shouldn't be tail-duplicated.
    if (I->isNotDuplicable())
      return false;

    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
    // A return may expand into a lot more instructions (e.g. reload of callee
    // saved registers) after PEI.
    if (PreRegAlloc && I->isReturn())
      return false;

    // Avoid duplicating calls before register allocation. Calls present a
    // barrier to register allocation so duplicating them may end up
    // increasing spills.
    if (PreRegAlloc && I->isCall())
      return false;

    if (!I->isPHI() && !I->isDebugValue())
      InstrCount += 1;

    if (InstrCount > MaxDuplicateCount)
      return false;
  }

  if (HasIndirectbr && PreRegAlloc)
    return true;

  if (IsSimple)
    return true;

  if (!PreRegAlloc)
    return true;

  return canCompletelyDuplicateBB(TailBB);
}

void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
  MachineFrameInfo *MFI = MF.getFrameInfo();
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  DebugLoc DL = MBBI->getDebugLoc();
  unsigned RetOpcode = MBBI->getOpcode();

  int NumBytes = MFI->getStackSize();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  // Initial and residual are named for consistency with the prologue. Note
  // that in the epilogue, the residual adjustment is executed first.
  uint64_t ArgumentPopSize = 0;
  if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments, this is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  // The stack frame should be like below,
  //
  //      ----------------------                     ---
  //      |                    |                      |
  //      | BytesInStackArgArea|              CalleeArgStackSize
  //      | (NumReusableBytes) |                (of tail call)
  //      |                    |                     ---
  //      |                    |                      |
  //      ---------------------|        ---           |
  //      |                    |         |            |
  //      |   CalleeSavedReg   |         |            |
  //      | (NumRestores * 16) |         |            |
  //      |                    |         |            |
  //      ---------------------|         |         NumBytes
  //      |                    |     StackSize  (StackAdjustUp)
  //      |   LocalStackSize   |         |            |
  //      | (covering callee   |         |            |
  //      |       args)        |         |            |
  //      |                    |         |            |
  //      ----------------------        ---          ---
  //
  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
  //             = StackSize + ArgumentPopSize
  //
  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
  // it as the 2nd argument of AArch64ISD::TC_RETURN.
  NumBytes += ArgumentPopSize;

  unsigned NumRestores = 0;

  // Move past the restores of the callee-saved registers.
  MachineBasicBlock::iterator LastPopI = MBBI;
  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
  if (LastPopI != MBB.begin()) {
    do {
      ++NumRestores;
      --LastPopI;
    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
    if (!isCSRestore(LastPopI, CSRegs)) {
      ++LastPopI;
      --NumRestores;
    }
  }
  NumBytes -= NumRestores * 16;
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  if (!hasFP(MF)) {
    // If this was a redzone leaf function, we don't need to restore the
    // stack pointer.
    if (!canUseRedZone(MF))
      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
                      TII);
    return;
  }

  // Restore the original stack pointer.
  // FIXME: Rather than doing the math here, we should instead just use
  // non-post-indexed loads for the restores if we aren't actually going to
  // be able to save any instructions.
  if (NumBytes || MFI->hasVarSizedObjects())
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
}

bool PatmosDelaySlotKiller::killDelaySlots(MachineBasicBlock &MBB) {
  bool Changed = false;

  DEBUG( dbgs() << "Killing slots in BB#" << MBB.getNumber()
                << " (" << MBB.getFullName() << ")\n" );

  // consider the basic block from top to bottom
  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
    // Control-flow instructions ("proper" delay slots)
    if (I->hasDelaySlot()) {
      assert( ( I->isCall() || I->isReturn() || I->isBranch() )
              && "Unexpected instruction with delay slot.");

      MachineBasicBlock::instr_iterator MI = *I;
      if (I->isBundle()) { ++MI; }

      unsigned Opcode = MI->getOpcode();

      if (Opcode == Patmos::BR ||
          Opcode == Patmos::BRu ||
          Opcode == Patmos::BRR ||
          Opcode == Patmos::BRRu ||
          Opcode == Patmos::BRT ||
          Opcode == Patmos::BRTu ||
          Opcode == Patmos::BRCF ||
          Opcode == Patmos::BRCFu ||
          Opcode == Patmos::BRCFR ||
          Opcode == Patmos::BRCFRu ||
          Opcode == Patmos::BRCFT ||
          Opcode == Patmos::BRCFTu ||
          Opcode == Patmos::CALL ||
          Opcode == Patmos::CALLR ||
          Opcode == Patmos::RET ||
          Opcode == Patmos::XRET) {

        bool onlyNops = true;
        unsigned maxCount = TM.getSubtargetImpl()->getDelaySlotCycles(&*I);
        unsigned count = 0;
        for (MachineBasicBlock::iterator K = llvm::next(I), E = MBB.end();
             K != E && count < maxCount; ++K, ++count) {
          TII->skipPseudos(MBB, K);
          if (K->getOpcode() != Patmos::NOP) {
            onlyNops = false;
          }
        }
        if (onlyNops) {
          unsigned NewOpcode = 0;
          switch(Opcode) {
          case Patmos::BR:     NewOpcode = Patmos::BRND;     break;
          case Patmos::BRu:    NewOpcode = Patmos::BRNDu;    break;
          case Patmos::BRR:    NewOpcode = Patmos::BRRND;    break;
          case Patmos::BRRu:   NewOpcode = Patmos::BRRNDu;   break;
          case Patmos::BRT:    NewOpcode = Patmos::BRTND;    break;
          case Patmos::BRTu:   NewOpcode = Patmos::BRTNDu;   break;
          case Patmos::BRCF:   NewOpcode = Patmos::BRCFND;   break;
          case Patmos::BRCFu:  NewOpcode = Patmos::BRCFNDu;  break;
          case Patmos::BRCFR:  NewOpcode = Patmos::BRCFRND;  break;
          case Patmos::BRCFRu: NewOpcode = Patmos::BRCFRNDu; break;
          case Patmos::BRCFT:  NewOpcode = Patmos::BRCFTND;  break;
          case Patmos::BRCFTu: NewOpcode = Patmos::BRCFTNDu; break;
          case Patmos::CALL:   NewOpcode = Patmos::CALLND;   break;
          case Patmos::CALLR:  NewOpcode = Patmos::CALLRND;  break;
          case Patmos::RET:    NewOpcode = Patmos::RETND;    break;
          case Patmos::XRET:   NewOpcode = Patmos::XRETND;   break;
          }
          const MCInstrDesc &nonDelayed = TII->get(NewOpcode);
          MI->setDesc(nonDelayed);

          unsigned killCount = 0;
          MachineBasicBlock::iterator K = llvm::next(I);
          for (MachineBasicBlock::iterator E = MBB.end();
               K != E && killCount < count; ++K, ++killCount) {
            TII->skipPseudos(MBB, K);
            KilledSlots++;
          }
          MBB.erase(llvm::next(I), K);
        }
      }
      Changed = true; // pass result
    }
  }
  return Changed;
}

unsigned PatmosInstrInfo::moveUp(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator &II,
                                 unsigned Cycles) const {
  // TODO We assume here that we do not have instructions which must be
  //      scheduled *within* a certain amount of cycles, except for branches
  //      (i.e., we do not emit overlapping pipelined MULs). Otherwise we
  //      would need to check if we violate any latency constraints when
  //      inserting an instruction.

  // Note: We assume that the instruction has no dependencies on previous
  // instructions within the given number of cycles. If we would check for
  // this, this would become a complete scheduler.

  // We might move an instruction
  // 1) outside of delay slots -> always possible
  // 2) into a delay slot -> optional, must add predicate and replace NOP or
  //    be bundled; we do not move other instructions around
  // 3) over a branch -> always possible if not predicated, but only until
  //    next delay slot and if not moved into a delay slot

  if (II->isBundled()) {
    // TODO moving bundled instructions is not yet supported.
    return Cycles;
  }

  MachineBasicBlock::iterator J = II;

  // determine start of first delay slot above the instruction
  int nonDelayed = findPrevDelaySlotEnd(MBB, J, Cycles);

  // Check if the instruction is inside a delay slot
  if (nonDelayed < 0) {
    // do not move it out of the delay slot
    // TODO we could move it, and insert a NOP instead..
    return Cycles;
  }

  bool isBranch = II->isBranch();
  bool isCFLInstr = isBranch || II->isCall() || II->isReturn();

  if (nonDelayed < (int)Cycles && J->isBranch() &&
      !isPredicated(&*II) && isPredicated(&*J) &&
      (!isCFLInstr || (isBranch && PST.allowBranchInsideCFLDelaySots()))) {
    // J points to the branch instruction
    unsigned delayed = nonDelayed + PST.getDelaySlotCycles(&*J) + 1;

    // Load the predicate of the branch
    // We assume here that a bundle only contains at most one branch,
    // that this instruction is the first instruction in the bundle, and
    // that the branch is actually predicated.
    // TODO add a check for this!
    SmallVector<MachineOperand,4> Pred;
    const MachineInstr *BR = getFirstMI(&*J);
    assert(BR->isBranch() && "Branch is not in the first slot");

    getPredicateOperands(BR, Pred);
    assert(Pred.size() >= 2 && "Branch instruction not predicated");

    // determine if instruction might be moved over the delay slot
    if (delayed <= Cycles) {
      // TODO We only move the instruction at most one cycle above the branch.
      //      We could move it further up, but then we need to check where the
      //      predicate is defined.
      MachineBasicBlock::iterator JJ = J;
      if (findPrevDelaySlotEnd(MBB, JJ, 0) >= 0) {
        // Move the instruction up and predicate it
        II = MBB.insert(J, MBB.remove(II));

        PredicateInstruction(&*II, Pred);
        NegatePredicate(&*II);

        return Cycles - delayed;
      }
    }

    // if not, check if we can move it into the delay slot
    MachineBasicBlock::iterator dst = J;

    // Going down from the branch until the first possible slot, checking
    // that the predicate is not redefined.
    // Note that we are not inserting the instruction, but replacing an
    // instruction, i.e., we move one instruction less over II than in the
    // other cases.
    while ((int)delayed > nonDelayed) {
      delayed--;

      if (delayed <= Cycles && moveTo(MBB, dst, II, &Pred, true)) {
        return Cycles - delayed;
      }

      // TODO check if this also finds a MTS $S0 !!
      if (dst->definesRegister(Pred[0].getReg(), &getRegisterInfo())) {
        break;
      }

      dst = nextNonPseudo(MBB, dst);
    }
  }

  if (nonDelayed > 0) {
    // we are staying below a delay slot, just move the instruction up
    J = II;
    recedeCycles(MBB, J, nonDelayed);

    II = MBB.insert(J, MBB.remove(II));

    return Cycles - nonDelayed;
  }

  return Cycles;
}

bool ARCInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
                                 MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
  TBB = FBB = nullptr;
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin())
    return false;
  --I;

  while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
    // Flag to be raised on unanalyzeable instructions. This is useful in cases
    // where we want to clean up on the end of the basic block before we bail
    // out.
    bool CantAnalyze = false;

    // Skip over DEBUG values and predicated nonterminators.
    while (I->isDebugValue() || !I->isTerminator()) {
      if (I == MBB.begin())
        return false;
      --I;
    }

    if (isJumpOpcode(I->getOpcode())) {
      // Indirect branches and jump tables can't be analyzed, but we still want
      // to clean up any instructions at the tail of the basic block.
      CantAnalyze = true;
    } else if (isUncondBranchOpcode(I->getOpcode())) {
      TBB = I->getOperand(0).getMBB();
    } else if (isCondBranchOpcode(I->getOpcode())) {
      // Bail out if we encounter multiple conditional branches.
      if (!Cond.empty())
        return true;

      assert(!FBB && "FBB should have been null.");
      FBB = TBB;
      TBB = I->getOperand(0).getMBB();
      Cond.push_back(I->getOperand(1));
      Cond.push_back(I->getOperand(2));
      Cond.push_back(I->getOperand(3));
    } else if (I->isReturn()) {
      // Returns can't be analyzed, but we should run cleanup.
      CantAnalyze = !isPredicated(*I);
    } else {
      // We encountered other unrecognized terminator. Bail out immediately.
      return true;
    }

    // Cleanup code - to be run for unpredicated unconditional branches and
    // returns.
    if (!isPredicated(*I) &&
        (isUncondBranchOpcode(I->getOpcode()) ||
         isJumpOpcode(I->getOpcode()) || I->isReturn())) {
      // Forget any previous condition branch information - it no longer
      // applies.
      Cond.clear();
      FBB = nullptr;

      // If we can modify the function, delete everything below this
      // unconditional branch.
      if (AllowModify) {
        MachineBasicBlock::iterator DI = std::next(I);
        while (DI != MBB.end()) {
          MachineInstr &InstToDelete = *DI;
          ++DI;
          InstToDelete.eraseFromParent();
        }
      }
    }

    if (CantAnalyze)
      return true;

    if (I == MBB.begin())
      return false;

    --I;
  }

  // We made it past the terminators without bailing out - we must have
  // analyzed this branch successfully.
  return false;
}
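
// For reference, a hedged sketch of how a client interprets the analyzeBranch
// contract implemented above. The surrounding setup (TII, MBB) is
// illustrative, but the TBB/FBB/Cond meanings are the standard
// TargetInstrInfo ones.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
  if (!TBB) {
    // No terminating branch: the block falls through to its layout successor.
  } else if (Cond.empty()) {
    // Unconditional branch to TBB.
  } else if (!FBB) {
    // Conditional branch to TBB, falling through to the layout successor.
  } else {
    // Conditional branch to TBB, otherwise branching to FBB.
  }
}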