/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
bool
TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
                                       bool IsSimple,
                                       MachineBasicBlock &TailBB) {
  // Only duplicate blocks that end with unconditional branches.
  if (TailBB.canFallThrough())
    return false;

  // Don't try to tail-duplicate single-block loops.
  if (TailBB.isSuccessor(&TailBB))
    return false;

  // Set the limit on the cost to duplicate. When optimizing for size,
  // duplicate only one, because one branch instruction can be eliminated to
  // compensate for the duplication.
  unsigned MaxDuplicateCount;
  if (TailDuplicateSize.getNumOccurrences() == 0 &&
      MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
    MaxDuplicateCount = 1;
  else
    MaxDuplicateCount = TailDuplicateSize;

  // If the target has hardware branch prediction that can handle indirect
  // branches, duplicating them can often make them predictable when there
  // are common paths through the code.  The limit needs to be high enough
  // to allow undoing the effects of tail merging and other optimizations
  // that rearrange the predecessors of the indirect branch.
  bool HasIndirectbr = false;
  if (!TailBB.empty())
    HasIndirectbr = TailBB.back().isIndirectBranch();

  if (HasIndirectbr && PreRegAlloc)
    MaxDuplicateCount = 20;

  // Check the instructions in the block to determine whether tail-duplication
  // is invalid or unlikely to be profitable.
  unsigned InstrCount = 0;
  for (MachineBasicBlock::iterator I = TailBB.begin(); I != TailBB.end(); ++I) {
    // Non-duplicable things shouldn't be tail-duplicated.
    if (I->isNotDuplicable())
      return false;

    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
    // A return may expand into a lot more instructions (e.g. reload of callee
    // saved registers) after PEI.
    if (PreRegAlloc && I->isReturn())
      return false;

    // Avoid duplicating calls before register allocation. Calls present a
    // barrier to register allocation, so duplicating them may end up
    // increasing spills.
    if (PreRegAlloc && I->isCall())
      return false;

    if (!I->isPHI() && !I->isDebugValue())
      InstrCount += 1;

    if (InstrCount > MaxDuplicateCount)
      return false;
  }

  if (HasIndirectbr && PreRegAlloc)
    return true;

  if (IsSimple)
    return true;

  if (!PreRegAlloc)
    return true;

  return canCompletelyDuplicateBB(TailBB);
}
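// Illustrative sketch (not part of the pass above): a standalone model of how
// shouldTailDuplicate picks its duplication limit. The helper name
// pickMaxDuplicateCount and the free-standing form are hypothetical; the
// thresholds (1 when optimizing for size and the TailDuplicateSize option was
// not set explicitly, 20 for an indirect branch before register allocation,
// otherwise the TailDuplicateSize value) mirror the code above.
static unsigned pickMaxDuplicateCount(bool OptForSize, bool UserSetSize,
                                      unsigned TailDupSizeOpt,
                                      bool HasIndirectBr, bool PreRegAlloc) {
  // Optimizing for size: allow only one duplicated instruction, since one
  // branch is removed to compensate, unless the user overrode the limit.
  unsigned Max = (OptForSize && !UserSetSize) ? 1 : TailDupSizeOpt;
  // Indirect branches get a much larger budget before register allocation so
  // duplication can undo tail merging and make them predictable again.
  if (HasIndirectBr && PreRegAlloc)
    Max = 20;
  return Max;
}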
void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
                                               MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               CallContext &Context) {
  // Check that this particular call sequence is amenable to the
  // transformation.
  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
                                       STI->getRegisterInfo());
  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();

  // We expect to enter this at the beginning of a call sequence
  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
  MachineBasicBlock::iterator FrameSetup = I++;
  Context.FrameSetup = FrameSetup;

  // How much do we adjust the stack? This puts an upper bound on
  // the number of parameters actually passed on it.
  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;

  // A zero adjustment means no stack parameters
  if (!MaxAdjust) {
    Context.NoStackParams = true;
    return;
  }

  // For globals in PIC mode, we can have some LEAs here.
  // Ignore them, they don't bother us.
  // TODO: Extend this to something that covers more cases.
  while (I->getOpcode() == X86::LEA32r)
    ++I;

  // We expect a copy instruction here.
  // TODO: The copy instruction is a lowering artifact.
  //       We should also support a copy-less version, where the stack
  //       pointer is used directly.
  if (!I->isCopy() || !I->getOperand(0).isReg())
    return;
  Context.SPCopy = I++;
  unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();

  // Scan the call setup sequence for the pattern we're looking for.
  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
  // instructions, that push a sequence of 32-bit values onto the stack, with
  // no gaps between them.
  if (MaxAdjust > 4)
    Context.MovVector.resize(MaxAdjust, nullptr);

  InstClassification Classification;
  DenseSet<unsigned int> UsedRegs;

  while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
         Exit) {
    if (Classification == Skip) {
      ++I;
      continue;
    }

    // We know the instruction is a MOV32mi/MOV32mr.
    // We only want movs of the form:
    // movl imm/r32, k(%esp)
    // If we run into something else, bail.
    // Note that AddrBaseReg may, counter to its name, not be a register,
    // but rather a frame index.
    // TODO: Support the fi case. This should probably work now that we
    // have the infrastructure to track the stack pointer within a call
    // sequence.
    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
        !I->getOperand(X86::AddrScaleAmt).isImm() ||
        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
        !I->getOperand(X86::AddrDisp).isImm())
      return;

    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    assert(StackDisp >= 0 &&
           "Negative stack displacement when passing parameters");

    // We really don't want to consider the unaligned case.
    if (StackDisp % 4)
      return;
    StackDisp /= 4;

    assert((size_t)StackDisp < Context.MovVector.size() &&
           "Function call has more parameters than the stack is adjusted for.");

    // If the same stack slot is being filled twice, something's fishy.
    if (Context.MovVector[StackDisp] != nullptr)
      return;
    Context.MovVector[StackDisp] = I;

    for (const MachineOperand &MO : I->uses()) {
      if (!MO.isReg())
        continue;
      unsigned int Reg = MO.getReg();
      if (RegInfo.isPhysicalRegister(Reg))
        UsedRegs.insert(Reg);
    }

    ++I;
  }

  // We now expect the end of the sequence. If we stopped early,
  // or reached the end of the block without finding a call, bail.
  if (I == MBB.end() || !I->isCall())
    return;

  Context.Call = I;
  if ((++I)->getOpcode() != FrameDestroyOpcode)
    return;

  // Now, go through the vector, and see that we don't have any gaps,
  // but only a series of 32-bit MOVs.
  auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
  for (; MMI != MME; ++MMI, Context.ExpectedDist += 4)
    if (*MMI == nullptr)
      break;

  // If the call had no parameters, do nothing
  if (MMI == Context.MovVector.begin())
    return;

  // We are either at the last parameter, or a gap.
  // Make sure it's not a gap
  for (; MMI != MME; ++MMI)
    if (*MMI != nullptr)
      return;

  Context.UsePush = true;
  return;
}
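// Illustrative sketch (standalone, hypothetical helper): the contiguity check
// collectCallInfo applies to Context.MovVector. Each entry stands for one
// 4-byte stack slot (nullptr when no MOV filled it); the sequence is only
// usable when the filled slots form a gap-free prefix, and ExpectedDist then
// grows by 4 per filled slot.
#include <cstddef>
#include <vector>

static bool slotsFormContiguousPrefix(const std::vector<const void *> &Slots,
                                      long long &ExpectedDist) {
  ExpectedDist = 0;
  std::size_t i = 0;
  // Count the leading run of filled slots.
  for (; i != Slots.size() && Slots[i] != nullptr; ++i)
    ExpectedDist += 4;
  // No stack parameters at all: nothing to transform.
  if (i == 0)
    return false;
  // A filled slot after the first gap means the pattern does not match.
  for (; i != Slots.size(); ++i)
    if (Slots[i] != nullptr)
      return false;
  return true;
}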
MachineBasicBlock::iterator
Filler::findDelayInstr(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator slot) {
  SmallSet<unsigned, 32> RegDefs;
  SmallSet<unsigned, 32> RegUses;
  bool sawLoad = false;
  bool sawStore = false;

  if (slot == MBB.begin())
    return MBB.end();

  if (slot->getOpcode() == SP::RET)
    return MBB.end();

  if (slot->getOpcode() == SP::RETL) {
    MachineBasicBlock::iterator J = slot;
    --J;

    if (J->getOpcode() == SP::RESTORErr
        || J->getOpcode() == SP::RESTOREri) {
      // change retl to ret.
      slot->setDesc(TM.getInstrInfo()->get(SP::RET));
      return J;
    }
  }

  // Call's delay filler can def some of call's uses.
  if (slot->isCall())
    insertCallDefsUses(slot, RegDefs, RegUses);
  else
    insertDefsUses(slot, RegDefs, RegUses);

  bool done = false;

  MachineBasicBlock::iterator I = slot;

  while (!done) {
    done = (I == MBB.begin());

    if (!done)
      --I;

    // skip debug value
    if (I->isDebugValue())
      continue;

    if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() ||
        I->hasDelaySlot() || isDelayFiller(MBB, I))
      break;

    if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
      insertDefsUses(I, RegDefs, RegUses);
      continue;
    }

    return I;
  }
  return MBB.end();
}
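// Illustrative sketch (standalone, simplified): the kind of filter
// findDelayInstr applies while scanning backwards for a delay-slot candidate.
// The struct and its fields are hypothetical stand-ins for the MachineInstr
// queries used above; the real pass additionally tracks register defs/uses
// and load/store hazards via delayHasHazard.
struct ScanStop {
  bool HasUnmodeledSideEffects;
  bool IsInlineAsm;
  bool IsLabel;
  bool HasDelaySlot;
  bool IsAlreadyDelayFiller;
};

// True if the backward scan must stop here: such an instruction can never be
// hoisted into the delay slot, and nothing above it may be moved past it.
static bool stopsDelaySlotSearch(const ScanStop &I) {
  return I.HasUnmodeledSideEffects || I.IsInlineAsm || I.IsLabel ||
         I.HasDelaySlot || I.IsAlreadyDelayFiller;
}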
bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
                                                  MachineBasicBlock &MBB,
                                                  MachineBasicBlock::iterator I) {

  // Check that this particular call sequence is amenable to the
  // transformation.
  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
                                       MF.getSubtarget().getRegisterInfo());
  unsigned StackPtr = RegInfo.getStackRegister();
  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();

  // We expect to enter this at the beginning of a call sequence
  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
  MachineBasicBlock::iterator FrameSetup = I++;

  // For globals in PIC mode, we can have some LEAs here.
  // Ignore them, they don't bother us.
  // TODO: Extend this to something that covers more cases.
  while (I->getOpcode() == X86::LEA32r)
    ++I;

  // We expect a copy instruction here.
  // TODO: The copy instruction is a lowering artifact.
  //       We should also support a copy-less version, where the stack
  //       pointer is used directly.
  if (!I->isCopy() || !I->getOperand(0).isReg())
    return false;
  MachineBasicBlock::iterator SPCopy = I++;
  StackPtr = SPCopy->getOperand(0).getReg();

  // Scan the call setup sequence for the pattern we're looking for.
  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
  // instructions, that push a sequence of 32-bit values onto the stack, with
  // no gaps between them.
  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
  if (MaxAdjust > 4)
    MovVector.resize(MaxAdjust, nullptr);

  do {
    int Opcode = I->getOpcode();
    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
      break;

    // We only want movs of the form:
    // movl imm/r32, k(%esp)
    // If we run into something else, bail.
    // Note that AddrBaseReg may, counter to its name, not be a register,
    // but rather a frame index.
    // TODO: Support the fi case. This should probably work now that we
    // have the infrastructure to track the stack pointer within a call
    // sequence.
    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
        !I->getOperand(X86::AddrScaleAmt).isImm() ||
        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
        !I->getOperand(X86::AddrDisp).isImm())
      return false;

    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    assert(StackDisp >= 0 &&
           "Negative stack displacement when passing parameters");

    // We really don't want to consider the unaligned case.
    if (StackDisp % 4)
      return false;
    StackDisp /= 4;

    assert((size_t)StackDisp < MovVector.size() &&
           "Function call has more parameters than the stack is adjusted for.");

    // If the same stack slot is being filled twice, something's fishy.
    if (MovVector[StackDisp] != nullptr)
      return false;
    MovVector[StackDisp] = I;

    ++I;
  } while (I != MBB.end());

  // We now expect the end of the sequence - a call and a stack adjust.
  if (I == MBB.end())
    return false;

  // For PCrel calls, we expect an additional COPY of the basereg.
  // If we find one, skip it.
  if (I->isCopy()) {
    if (I->getOperand(1).getReg() ==
        MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
      ++I;
    else
      return false;
  }

  if (!I->isCall())
    return false;
  MachineBasicBlock::iterator Call = I;
  if ((++I)->getOpcode() != FrameDestroyOpcode)
    return false;

  // Now, go through the vector, and see that we don't have any gaps,
  // but only a series of 32-bit MOVs.
  int64_t ExpectedDist = 0;
  auto MMI = MovVector.begin(), MME = MovVector.end();
  for (; MMI != MME; ++MMI, ExpectedDist += 4)
    if (*MMI == nullptr)
      break;

  // If the call had no parameters, do nothing
  if (!ExpectedDist)
    return false;

  // We are either at the last parameter, or a gap.
  // Make sure it's not a gap
  for (; MMI != MME; ++MMI)
    if (*MMI != nullptr)
      return false;

  // Ok, we can in fact do the transformation for this call.
  // Do not remove the FrameSetup instruction, but adjust the parameters.
  // PEI will end up finalizing the handling of this.
  FrameSetup->getOperand(1).setImm(ExpectedDist);

  DebugLoc DL = I->getDebugLoc();
  // Now, iterate through the vector in reverse order, and replace the movs
  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
  // replace uses.
  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
    MachineBasicBlock::iterator MOV = *MovVector[Idx];
    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
    if (MOV->getOpcode() == X86::MOV32mi) {
      unsigned PushOpcode = X86::PUSHi32;
      // If the operand is a small (8-bit) immediate, we can use a
      // PUSH instruction with a shorter encoding.
      // Note that isImm() may fail even though this is a MOVmi, because
      // the operand can also be a symbol.
      if (PushOp.isImm()) {
        int64_t Val = PushOp.getImm();
        if (isInt<8>(Val))
          PushOpcode = X86::PUSH32i8;
      }
      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
    } else {
      unsigned int Reg = PushOp.getReg();

      // If PUSHrmm is not slow on this target, try to fold the source of the
      // push into the instruction.
      const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();

      // Check that this is legal to fold. Right now, we're extremely
      // conservative about that.
      MachineInstr *DefMov = nullptr;
      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));

        unsigned NumOps = DefMov->getDesc().getNumOperands();
        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
          Push->addOperand(DefMov->getOperand(i));

        DefMov->eraseFromParent();
      } else {
        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr();
      }
    }

    MBB.erase(MOV);
  }

  // The stack-pointer copy is no longer used in the call sequences.
  // There should not be any other users, but we can't commit to that, so:
  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
    SPCopy->eraseFromParent();

  // Once we've done this, we need to make sure PEI doesn't assume a reserved
  // frame.
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  FuncInfo->setHasPushSequences(true);

  return true;
}
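// Illustrative sketch (standalone, hypothetical helper): the opcode choice
// made above when a MOV32mi becomes a push. An immediate that fits in a
// signed 8-bit field gets the shorter PUSH32i8 encoding; anything else (or a
// symbolic operand, for which isImm() fails) uses PUSHi32.
#include <cstdint>

enum class PushKind { Push32i8, PushI32 };

static PushKind choosePushForImmediate(std::int64_t Val) {
  // Same condition as isInt<8>(Val) in the pass: representable in 8 signed bits.
  return (Val >= -128 && Val <= 127) ? PushKind::Push32i8 : PushKind::PushI32;
}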
bool PatmosDelaySlotKiller::killDelaySlots(MachineBasicBlock &MBB) {
  bool Changed = false;

  DEBUG( dbgs() << "Killing slots in BB#" << MBB.getNumber()
                << " (" << MBB.getFullName() << ")\n" );

  // consider the basic block from top to bottom
  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
    // Control-flow instructions ("proper" delay slots)
    if (I->hasDelaySlot()) {
      assert( ( I->isCall() || I->isReturn() || I->isBranch() )
              && "Unexpected instruction with delay slot.");

      MachineBasicBlock::instr_iterator MI = *I;
      if (I->isBundle()) { ++MI; }

      unsigned Opcode = MI->getOpcode();

      if (Opcode == Patmos::BR || Opcode == Patmos::BRu ||
          Opcode == Patmos::BRR || Opcode == Patmos::BRRu ||
          Opcode == Patmos::BRT || Opcode == Patmos::BRTu ||
          Opcode == Patmos::BRCF || Opcode == Patmos::BRCFu ||
          Opcode == Patmos::BRCFR || Opcode == Patmos::BRCFRu ||
          Opcode == Patmos::BRCFT || Opcode == Patmos::BRCFTu ||
          Opcode == Patmos::CALL || Opcode == Patmos::CALLR ||
          Opcode == Patmos::RET || Opcode == Patmos::XRET) {

        bool onlyNops = true;
        unsigned maxCount = TM.getSubtargetImpl()->getDelaySlotCycles(&*I);
        unsigned count = 0;
        for (MachineBasicBlock::iterator K = llvm::next(I), E = MBB.end();
             K != E && count < maxCount; ++K, ++count) {
          TII->skipPseudos(MBB, K);
          if (K->getOpcode() != Patmos::NOP) {
            onlyNops = false;
          }
        }
        if (onlyNops) {
          unsigned NewOpcode = 0;
          switch(Opcode) {
          case Patmos::BR:     NewOpcode = Patmos::BRND;     break;
          case Patmos::BRu:    NewOpcode = Patmos::BRNDu;    break;
          case Patmos::BRR:    NewOpcode = Patmos::BRRND;    break;
          case Patmos::BRRu:   NewOpcode = Patmos::BRRNDu;   break;
          case Patmos::BRT:    NewOpcode = Patmos::BRTND;    break;
          case Patmos::BRTu:   NewOpcode = Patmos::BRTNDu;   break;
          case Patmos::BRCF:   NewOpcode = Patmos::BRCFND;   break;
          case Patmos::BRCFu:  NewOpcode = Patmos::BRCFNDu;  break;
          case Patmos::BRCFR:  NewOpcode = Patmos::BRCFRND;  break;
          case Patmos::BRCFRu: NewOpcode = Patmos::BRCFRNDu; break;
          case Patmos::BRCFT:  NewOpcode = Patmos::BRCFTND;  break;
          case Patmos::BRCFTu: NewOpcode = Patmos::BRCFTNDu; break;
          case Patmos::CALL:   NewOpcode = Patmos::CALLND;   break;
          case Patmos::CALLR:  NewOpcode = Patmos::CALLRND;  break;
          case Patmos::RET:    NewOpcode = Patmos::RETND;    break;
          case Patmos::XRET:   NewOpcode = Patmos::XRETND;   break;
          }
          const MCInstrDesc &nonDelayed = TII->get(NewOpcode);
          MI->setDesc(nonDelayed);

          unsigned killCount = 0;
          MachineBasicBlock::iterator K = llvm::next(I);
          for (MachineBasicBlock::iterator E = MBB.end();
               K != E && killCount < count; ++K, ++killCount) {
            TII->skipPseudos(MBB, K);
            KilledSlots++;
          }
          MBB.erase(llvm::next(I), K);
        }
      }
      Changed = true; // pass result
    }
  }
  return Changed;
}
unsigned PatmosInstrInfo::moveUp(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator &II,
                                 unsigned Cycles) const
{
  // TODO We assume here that we do not have instructions which must be
  //      scheduled *within* a certain amount of cycles, except for branches
  //      (i.e., we do not emit overlapping pipelined MULs). Otherwise we
  //      would need to check if we violate any latency constraints when
  //      inserting an instruction.
  // Note: We assume that the instruction has no dependencies on previous
  //      instructions within the given number of cycles. If we would check
  //      for this, this would become a complete scheduler.

  // We might move an instruction
  // 1) outside of delay slots -> always possible
  // 2) into a delay slot -> optional, must add predicate and replace NOP or
  //    be bundled; we do not move other instructions around
  // 3) over a branch -> always possible if not predicated, but only until
  //    next delay slot and if not moved into a delay slot

  if (II->isBundled()) {
    // TODO moving bundled instructions is not yet supported.
    return Cycles;
  }

  MachineBasicBlock::iterator J = II;

  // determine start of first delay slot above the instruction
  int nonDelayed = findPrevDelaySlotEnd(MBB, J, Cycles);

  // Check if the instruction is inside a delay slot
  if (nonDelayed < 0) {
    // do not move it out of the delay slot
    // TODO we could move it, and insert a NOP instead..
    return Cycles;
  }

  bool isBranch = II->isBranch();
  bool isCFLInstr = isBranch || II->isCall() || II->isReturn();

  if (nonDelayed < (int)Cycles && J->isBranch() &&
      !isPredicated(&*II) && isPredicated(&*J) &&
      (!isCFLInstr || (isBranch && PST.allowBranchInsideCFLDelaySots())))
  {
    // J points to the branch instruction
    unsigned delayed = nonDelayed + PST.getDelaySlotCycles(&*J) + 1;

    // Load the predicate of the branch
    // We assume here that a bundle only contains at most one branch,
    // that this instruction is the first instruction in the bundle, and
    // that the branch is actually predicated.
    // TODO add a check for this!
    SmallVector<MachineOperand,4> Pred;
    const MachineInstr *BR = getFirstMI(&*J);
    assert(BR->isBranch() && "Branch is not in the first slot");

    getPredicateOperands(BR, Pred);
    assert(Pred.size() >= 2 && "Branch instruction not predicated");

    // determine if instruction might be moved over the delay slot
    if (delayed <= Cycles) {

      // TODO We only move the instruction at most one cycle above the branch.
      //      We could move it further up, but then we need to check where
      //      the predicate is defined.
      MachineBasicBlock::iterator JJ = J;
      if (findPrevDelaySlotEnd(MBB, JJ, 0) >= 0) {

        // Move the instruction up and predicate it
        II = MBB.insert(J, MBB.remove(II));

        PredicateInstruction(&*II, Pred);
        NegatePredicate(&*II);

        return Cycles - delayed;
      }
    }

    // if not, check if we can move it into the delay slot
    MachineBasicBlock::iterator dst = J;

    // Going down from the branch until the first possible slot, checking
    // that the predicate is not redefined.
    // Note that we are not inserting the instruction, but replacing an
    // instruction, i.e., we move one instruction less over II than in the
    // other cases.
    while ((int)delayed > nonDelayed) {
      delayed--;

      if (delayed <= Cycles && moveTo(MBB, dst, II, &Pred, true)) {
        return Cycles - delayed;
      }

      // TODO check if this also finds a MTS $S0 !!
      if (dst->definesRegister(Pred[0].getReg(), &getRegisterInfo())) {
        break;
      }

      dst = nextNonPseudo(MBB, dst);
    }
  }

  if (nonDelayed > 0) {
    // we are staying below a delay slot, just move the instruction up
    J = II;
    recedeCycles(MBB, J, nonDelayed);

    II = MBB.insert(J, MBB.remove(II));

    return Cycles - nonDelayed;
  }

  return Cycles;
}
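// Illustrative sketch (standalone, hypothetical helper): the cycle accounting
// moveUp performs. Moving the instruction to just above a predicated branch
// skips the branch and its delay slots (nonDelayed + delay-slot cycles + 1),
// while staying below the delay-slot region only gains nonDelayed cycles; the
// return value is how many cycles are still left to move.
static unsigned cyclesLeftAfterMove(unsigned Cycles, unsigned NonDelayed,
                                    unsigned DelaySlotCycles,
                                    bool MovedAboveBranch) {
  unsigned Moved = MovedAboveBranch ? NonDelayed + DelaySlotCycles + 1
                                    : NonDelayed;
  // The pass only commits to a move when Moved <= Cycles; clamp defensively
  // here so the sketch stays total.
  return Moved <= Cycles ? Cycles - Moved : 0;
}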
bool GCMachineCodeFixup::runOnMachineFunction(MachineFunction &MF) {
  // Quick exit for functions that do not use GC.
  if (!MF.getFunction()->hasGC())
    return false;

  const TargetMachine &TM = MF.getTarget();
  const TargetInstrInfo *TII = TM.getInstrInfo();
  GCModuleInfo &GMI = getAnalysis<GCModuleInfo>();
  GCFunctionInfo &GCFI = GMI.getFunctionInfo(*MF.getFunction());

  for (MachineFunction::iterator MBBI = MF.begin(),
                                 MBBE = MF.end(); MBBI != MBBE; ++MBBI) {
    for (MachineBasicBlock::iterator MII = MBBI->begin(),
                                     MIE = MBBI->end(); MII != MIE;) {
      if (!MII->isGCRegRoot() || !MII->getOperand(0).isReg()) {
        ++MII;
        continue;
      }

      // Trace the register back to its location at the site of the call
      // (either a physical reg or a frame index).
      bool TracingReg = true;
      unsigned TracedReg = MII->getOperand(0).getReg();
      int FrameIndex;

      MachineBasicBlock::iterator PrevII = MII;
      for (--PrevII;; --PrevII) {
        if (PrevII->isGCRegRoot() && PrevII->getOperand(0).isReg())
          break;
        if (PrevII->isCall())
          break;

        int FI;

        // Trace back through register reloads.
        unsigned Reg =
          TM.getInstrInfo()->isLoadFromStackSlotPostFE(&*PrevII, FI);
        if (Reg) {
          // This is a reload. If we're tracing this register, start tracing
          // the frame index instead.
          if (TracingReg && TracedReg == Reg) {
            TracingReg = false;
            FrameIndex = FI;
          }
          continue;
        }

        // Trace back through spills.
        if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&*PrevII, FI))
          continue;

        // Trace back through register-to-register copies.
        if (PrevII->isCopy()) {
          if (TracingReg && TracedReg == PrevII->getOperand(0).getReg())
            TracedReg = PrevII->getOperand(1).getReg();
          continue;
        }

        // Trace back through non-register GC_REG_ROOT instructions.
        if (PrevII->isGCRegRoot() && !PrevII->getOperand(0).isReg())
          continue;

        DEBUG(dbgs() << "Bad instruction: " << *PrevII);
        llvm_unreachable("GC_REG_ROOT found in an unexpected location!");
      }

      // Now we've reached either a call or another GC_REG_ROOT instruction.
      // Move the GC_REG_ROOT instruction we're considering to the right
      // place, and rewrite it if necessary.
      //
      // Also, tell the GCFunctionInfo about the frame index, since this is
      // our only chance -- the frame indices will be deleted by the time
      // GCMachineCodeAnalysis runs.
      ++PrevII;
      unsigned RootIndex = MII->getOperand(1).getImm();
      MachineInstr *NewMI;
      if (TracingReg) {
        MachineInstrBuilder MIB = BuildMI(MF, MII->getDebugLoc(),
                                          TII->get(TargetOpcode::GC_REG_ROOT));
        MIB.addReg(TracedReg).addImm(RootIndex);
        NewMI = MIB;
      } else {
        NewMI = TII->emitFrameIndexGCRegRoot(MF, FrameIndex, RootIndex,
                                             MII->getDebugLoc());
        GCFI.spillRegRoot(RootIndex, FrameIndex);
      }

      MBBI->insert(PrevII, NewMI);

      MachineBasicBlock::iterator NextII = MII;
      ++NextII;
      MII->eraseFromParent();
      MII = NextII;
    }
  }

  return true;
}
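// Illustrative sketch (standalone, simplified): the backward trace the fixup
// performs, reduced to copies and reloads. Starting from the register named
// by a GC_REG_ROOT, walk earlier instructions: a copy redirects the trace to
// its source register, and a reload from a stack slot ends the register trace
// and yields a frame index instead. All types and names here are hypothetical.
#include <vector>

struct TraceStep {
  enum Kind { Copy, Reload } K;
  unsigned DstReg;    // register defined by this instruction
  unsigned SrcReg;    // for Copy: the source register
  int FrameIndex;     // for Reload: the stack slot being loaded
};

struct RootLocation {
  bool IsReg;
  unsigned Reg;
  int FrameIndex;
};

// Steps are given starting at the instruction just above the GC_REG_ROOT and
// walking back toward the call, i.e. already in reverse program order.
static RootLocation traceRootLocation(unsigned Reg,
                                      const std::vector<TraceStep> &Steps) {
  for (const TraceStep &S : Steps) {
    if (S.DstReg != Reg)
      continue;
    if (S.K == TraceStep::Copy)
      Reg = S.SrcReg;                    // keep tracing the copy's source
    else
      return {false, 0, S.FrameIndex};   // reload: the root lives on the stack
  }
  return {true, Reg, 0};                 // still in a register at the call
}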