// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  SmallVector<MachineInstr *, 4> RemoveMI;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may
        // read from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() &&
                   I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around.
        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: "
                       << *I << '\n');

          // Wait on everything, not just LGKM. vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit. Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
            .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits.
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to
      // finish, but we also want to wait for any other outstanding transfers
      // before signalling other hardware blocks.
      if (I->getOpcode() == AMDGPU::S_BARRIER ||
          I->getOpcode() == AMDGPU::S_SENDMSG)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);
    }

    // Wait for everything at the end of the MBB.
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  return Changes;
}
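// Minimal sketches of two helpers the pass relies on, reconstructed here for
// context (assumed shapes, not necessarily the upstream definitions):
// readsVCCZ() recognizes the branches that consume vccz, and
// hasOutstandingLGKM() checks whether any LGKM transfer is still in flight.
static bool readsVCCZ(unsigned Opcode) {
  return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
}

bool SIInsertWaits::hasOutstandingLGKM() const {
  // Assumes Counters exposes a named LGKM field, as the rest of the pass does.
  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}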
void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();
  MachineFrameInfo *MFFrame = MF.getFrameInfo();
  auto *ZII =
    static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineModuleInfo &MMI = MF.getMMI();
  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
  const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
  bool HasFP = hasFP(MF);
  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

  // The current offset of the stack pointer from the CFA.
  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;

  if (ZFI->getLowSavedGPR()) {
    // Skip over the GPR saves.
    if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
      ++MBBI;
    else
      llvm_unreachable("Couldn't skip over GPR saves");

    // Add CFI for the GPR saves.
    for (auto &Save : CSI) {
      unsigned Reg = Save.getReg();
      if (SystemZ::GR64BitRegClass.contains(Reg)) {
        int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
        unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
        BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex);
      }
    }
  }

  uint64_t StackSize = getAllocatedStackSize(MF);
  if (StackSize) {
    // Allocate StackSize bytes.
    int64_t Delta = -int64_t(StackSize);
    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);

    // Add CFI for the allocation.
    unsigned CFIIndex = MMI.addFrameInst(
        MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
    SPOffsetFromCFA += Delta;
  }

  if (HasFP) {
    // Copy the base of the frame to R11.
    BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
      .addReg(SystemZ::R15D);

    // Add CFI for the new frame location.
    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
    unsigned CFIIndex = MMI.addFrameInst(
        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);

    // Mark the FramePtr as live at the beginning of every block except
    // the entry block. (We'll have marked R11 as live on entry when
    // saving the GPRs.)
    for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
      I->addLiveIn(SystemZ::R11D);
  }

  // Skip over the FPR saves.
  SmallVector<unsigned, 8> CFIIndexes;
  for (auto &Save : CSI) {
    unsigned Reg = Save.getReg();
    if (SystemZ::FP64BitRegClass.contains(Reg)) {
      if (MBBI != MBB.end() &&
          (MBBI->getOpcode() == SystemZ::STD ||
           MBBI->getOpcode() == SystemZ::STDY))
        ++MBBI;
      else
        llvm_unreachable("Couldn't skip over FPR save");

      // Add CFI for this save.
      unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
      int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx());
      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
          nullptr, DwarfReg, SPOffsetFromCFA + Offset));
      CFIIndexes.push_back(CFIIndex);
    }
  }
  // Complete the CFI for the FPR saves, modelling them as taking effect
  // after the last save.
  for (auto CFIIndex : CFIIndexes) {
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
  }
}
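// Sketch of the emitIncrement() helper used above to adjust R15 (assumed
// shape; the real helper may differ in detail). It splits an arbitrary
// adjustment into AGHI/AGFI-sized steps and marks the CC def dead.
static void emitIncrement(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          const DebugLoc &DL, unsigned Reg, int64_t NumBytes,
                          const TargetInstrInfo *TII) {
  while (NumBytes) {
    unsigned Opcode;
    int64_t ThisVal = NumBytes;
    if (isInt<16>(NumBytes))
      Opcode = SystemZ::AGHI;
    else {
      Opcode = SystemZ::AGFI;
      // Make sure we maintain 8-byte stack alignment.
      int64_t MinVal = -uint64_t(1) << 31;
      int64_t MaxVal = (int64_t(1) << 31) - 8;
      if (ThisVal < MinVal)
        ThisVal = MinVal;
      else if (ThisVal > MaxVal)
        ThisVal = MaxVal;
    }
    MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
                           .addReg(Reg).addImm(ThisVal);
    // The CC implicit def is dead.
    MI->getOperand(3).setIsDead();
    NumBytes -= ThisVal;
  }
}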
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
       BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
      MachineInstr &MI = *I;
      I = llvm::next(I);

      // Expand LDS_*_RET instructions.
      if (TII->isLDSRetInstr(MI.getOpcode())) {
        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        assert(DstIdx != -1);
        MachineOperand &DstOp = MI.getOperand(DstIdx);
        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
                                               DstOp.getReg(), AMDGPU::OQAP);
        DstOp.setReg(AMDGPU::OQAP);
        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
                                               AMDGPU::OpName::pred_sel);
        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
                                               AMDGPU::OpName::pred_sel);
        // Copy the pred_sel bit.
        Mov->getOperand(MovPredSelIdx).setReg(
            MI.getOperand(LDSPredSelIdx).getReg());
      }

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        uint64_t Flags = MI.getOperand(3).getImm();
        // The native opcode used by PRED_X is stored as an immediate in the
        // third operand.
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                    MI.getOperand(2).getImm(), // opcode
                                    MI.getOperand(0).getReg(), // dst
                                    MI.getOperand(1).getReg(), // src0
                                    AMDGPU::ZERO);             // src1
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
        } else {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
        }
        MI.eraseFromParent();
        continue;
      }
      case AMDGPU::INTERP_PAIR_XY: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
            MI.getOperand(2).getImm());
        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;
          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }
        MI.eraseFromParent();
        continue;
      }
      case AMDGPU::INTERP_PAIR_ZW: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
            MI.getOperand(2).getImm());
        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;
          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan - 2).getReg();
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }
        MI.eraseFromParent();
        continue;
      }
      case AMDGPU::INTERP_VEC_LOAD: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
            MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();
        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }
        MI.eraseFromParent();
        continue;
      }
      case AMDGPU::DOT_4: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        unsigned DstReg = MI.getOperand(0).getReg();
        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned SubDstReg =
              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
          MachineInstr *BMI =
              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Mask) {
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
          unsigned Opcode = BMI->getOpcode();
          // While not strictly necessary from a hardware point of view, we
          // force all src operands of a dot4 instruction to belong to the
          // same slot.
          unsigned Src0 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)).getReg();
          unsigned Src1 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)).getReg();
          (void) Src0;
          (void) Src1;
          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
              (TRI.getEncodingValue(Src1) & 0xff) < 127)
            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
        }
        MI.eraseFromParent();
        continue;
      }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      //   T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      //   T0_X = DP4 T1_X, T2_X
      //   T0_Y (write masked) = DP4 T1_Y, T2_Y
      //   T0_Z (write masked) = DP4 T1_Z, T2_Z
      //   T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      //   T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      //   T0_X = MULLO_INT T1_X, T2_X
      //   T0_Y (write masked) = MULLO_INT T1_X, T2_X
      //   T0_Z (write masked) = MULLO_INT T1_X, T2_X
      //   T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      //   T0_XYZW = CUBE T1_XYZW
      // becomes:
      //   T0_X = CUBE T1_Z, T1_Y
      //   T0_Y = CUBE T1_Z, T1_X
      //   T0_Z = CUBE T1_X, T1_Z
      //   T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
        unsigned Src0 = MI.getOperand(
            TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers.
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination registers.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current Channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Set the IsLast bit.
        NotLast = (Chan != 3);

        // Add the new instruction.
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
            TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
        }
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
      }
      MI.eraseFromParent();
    }
  }
  return false;
}
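// Sketch of SetFlagInNewMI(), which copies a per-operand immediate flag
// (clamp, literal, abs, neg) from the pseudo instruction onto each expanded
// slot. This is an assumed shape for context, not necessarily the upstream
// definition.
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
                                                 const MachineInstr *OldMI,
                                                 unsigned Op) {
  int OpIdx = TII->getOperandIdx(*OldMI, Op);
  if (OpIdx > -1) {
    uint64_t Val = OldMI->getOperand(OpIdx).getImm();
    TII->setImmOperand(NewMI, Op, Val);
  }
}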
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm()) {
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because the register allocator has
          // trouble with sequences like this, which cause the allocator to run
          // out of registers if vreg0 and vreg1 belong to the VCCReg register
          // class:
          //   vreg0 = VOPC;
          //   vreg1 = VOPC;
          //   S_AND_B64 vreg0, vreg1
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction.
      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // dst
      Inst32.addOperand(MI.getOperand(0));

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}
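// Illustrative effect of the shrink performed above (sketch; register names
// are made up):
//   %vreg2 = V_ADD_F32_e64 %vreg0, %vreg1     ; 64-bit VOP3 encoding
// becomes, once canShrink() confirms no modifiers or extra outputs are
// needed:
//   %vreg2 = V_ADD_F32_e32 %vreg0, %vreg1     ; 32-bit VOP2 encoding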
/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
/// register references and actual offsets.
///
void PEI::replaceFrameIndices(MachineFunction &Fn) {
  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?

  const TargetMachine &TM = Fn.getTarget();
  assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
  const TargetFrameLowering *TFI = TM.getFrameLowering();
  bool StackGrowsDown =
    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
  int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
  int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();

  for (MachineFunction::iterator BB = Fn.begin(),
         E = Fn.end(); BB != E; ++BB) {
#ifndef NDEBUG
    int SPAdjCount = 0; // frame setup / destroy count.
#endif
    int SPAdj = 0;  // SP offset due to call frame setup / destroy.
    if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);

    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {

      if (I->getOpcode() == FrameSetupOpcode ||
          I->getOpcode() == FrameDestroyOpcode) {
#ifndef NDEBUG
        // Track whether we see even pairs of them
        SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1;
#endif
        // Remember how much SP has been adjusted to create the call
        // frame.
        int Size = I->getOperand(0).getImm();

        if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
            (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
          Size = -Size;

        SPAdj += Size;

        MachineBasicBlock::iterator PrevI = BB->end();
        if (I != BB->begin()) PrevI = prior(I);
        TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);

        // Visit the instructions created by eliminateCallFramePseudoInstr().
        if (PrevI == BB->end())
          I = BB->begin();     // The replaced instr was the first in the block.
        else
          I = llvm::next(PrevI);
        continue;
      }

      MachineInstr *MI = I;
      bool DoIncr = true;
      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
        if (!MI->getOperand(i).isFI())
          continue;

        // Some instructions (e.g. inline asm instructions) can have
        // multiple frame indices and/or cause eliminateFrameIndex
        // to insert more than one instruction. We need the register
        // scavenger to go through all of these instructions so that
        // it can update its register information. We keep the
        // iterator at the point before insertion so that we can
        // revisit them in full.
        bool AtBeginning = (I == BB->begin());
        if (!AtBeginning) --I;

        // If this instruction has a FrameIndex operand, we need to
        // use that target machine register info object to eliminate
        // it.
        TRI.eliminateFrameIndex(MI, SPAdj, i,
                                FrameIndexVirtualScavenging ? NULL : RS);

        // Reset the iterator if we were at the beginning of the BB.
        if (AtBeginning) {
          I = BB->begin();
          DoIncr = false;
        }

        MI = 0;
        break;
      }

      if (DoIncr && I != BB->end()) ++I;

      // Update register states.
      if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
    }

    // If we have evenly matched pairs of frame setup / destroy instructions,
    // make sure the adjustments come out to zero. If we don't have matched
    // pairs, we can't be sure the missing bit isn't in another basic block
    // due to a custom inserter playing tricks, so just asserting SPAdj==0
    // isn't sufficient. See tMOVCC on Thumb1, for example.
    assert((SPAdjCount || SPAdj == 0) &&
           "Unbalanced call frame setup / destroy pairs?");
  }
}
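// Illustrative sketch of the target hook this loop drives (hypothetical
// target; real implementations live in each backend's RegisterInfo
// subclass). It rewrites the FI operand into a base register plus the
// object's offset, folding in the in-flight call-frame adjustment SPAdj.
void HypotheticalRegisterInfo::eliminateFrameIndex(MachineInstr *MI, int SPAdj,
                                                   unsigned OpNum,
                                                   RegScavenger *RS) const {
  MachineFunction &MF = *MI->getParent()->getParent();
  int FrameIndex = MI->getOperand(OpNum).getIndex();
  // Object offset relative to the frame register, plus any pending SP
  // adjustment from call frame setup/destroy pseudos.
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + SPAdj;
  MI->getOperand(OpNum).ChangeToRegister(getFrameRegister(MF), false);
  MI->getOperand(OpNum + 1).ChangeToImmediate(Offset);
}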
bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  TII = MF.getTarget().getInstrInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  LI = &getAnalysis<LiveIntervals>();

  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      unsigned DestReg = BBI->getOperand(0).getReg();
      addReg(DestReg);
      PHISrcDefs[I].push_back(BBI);

      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
        MachineOperand &SrcMO = BBI->getOperand(i);
        unsigned SrcReg = SrcMO.getReg();
        addReg(SrcReg);
        unionRegs(DestReg, SrcReg);

        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
        if (DefMI)
          PHISrcDefs[DefMI->getParent()].push_back(DefMI);
      }
    }
  }

  // Perform a depth-first traversal of the dominator tree, splitting
  // interferences amongst PHI-congruence classes.
  DenseMap<unsigned, unsigned> CurrentDominatingParent;
  DenseMap<unsigned, unsigned> ImmediateDominatingParent;
  for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
    SplitInterferencesForBasicBlock(*DI->getBlock(),
                                    CurrentDominatingParent,
                                    ImmediateDominatingParent);
  }

  // Insert copies for all PHI source and destination registers.
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      InsertCopiesForPHI(BBI, I);
    }
  }

  // FIXME: Preserve the equivalence classes during copy insertion and use
  // the preserved equivalence classes instead of recomputing them.
  RegNodeMap.clear();
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      unsigned DestReg = BBI->getOperand(0).getReg();
      addReg(DestReg);

      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
        unsigned SrcReg = BBI->getOperand(i).getReg();
        addReg(SrcReg);
        unionRegs(DestReg, SrcReg);
      }
    }
  }

  DenseMap<unsigned, unsigned> RegRenamingMap;
  bool Changed = false;
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
    while (BBI != BBE && BBI->isPHI()) {
      MachineInstr *PHI = BBI;

      assert(PHI->getNumOperands() > 0);

      unsigned SrcReg = PHI->getOperand(1).getReg();
      unsigned SrcColor = getRegColor(SrcReg);
      unsigned NewReg = RegRenamingMap[SrcColor];
      if (!NewReg) {
        NewReg = SrcReg;
        RegRenamingMap[SrcColor] = SrcReg;
      }
      MergeLIsAndRename(SrcReg, NewReg);

      unsigned DestReg = PHI->getOperand(0).getReg();
      if (!InsertedDestCopies.count(DestReg))
        MergeLIsAndRename(DestReg, NewReg);

      for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
        unsigned SrcReg = PHI->getOperand(i).getReg();
        MergeLIsAndRename(SrcReg, NewReg);
      }

      ++BBI;
      LI->RemoveMachineInstrFromMaps(PHI);
      PHI->eraseFromParent();
      Changed = true;
    }
  }

  // Due to the insertion of copies to split live ranges, the live intervals
  // are guaranteed to not overlap, except in one case: an original PHI source
  // and a PHI destination copy. In this case, they have the same value and
  // thus don't truly intersect, so we merge them into the value live at that
  // point.
  // FIXME: Is there some better way we can handle this?
  for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
       E = InsertedDestCopies.end(); I != E; ++I) {
    unsigned DestReg = I->first;
    unsigned DestColor = getRegColor(DestReg);
    unsigned NewReg = RegRenamingMap[DestColor];

    LiveInterval &DestLI = LI->getInterval(DestReg);
    LiveInterval &NewLI = LI->getInterval(NewReg);

    assert(DestLI.ranges.size() == 1 &&
           "PHI destination copy's live interval should be a single live "
           "range from the beginning of the BB to the copy instruction.");

    LiveRange *DestLR = DestLI.begin();
    VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
    if (!NewVNI) {
      NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
      MachineInstr *CopyInstr = I->second;
      CopyInstr->getOperand(1).setIsKill(true);
    }

    LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
    NewLI.addRange(NewLR);

    LI->removeInterval(DestReg);
    MRI->replaceRegWith(DestReg, NewReg);
  }

  // Adjust the live intervals of all PHI source registers to handle the case
  // where the PHIs in successor blocks were the only later uses of the source
  // register.
  for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
       E = InsertedSrcCopySet.end(); I != E; ++I) {
    MachineBasicBlock *MBB = I->first;
    unsigned SrcReg = I->second;
    if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
      SrcReg = RenamedRegister;

    LiveInterval &SrcLI = LI->getInterval(SrcReg);

    bool isLiveOut = false;
    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
         SE = MBB->succ_end(); SI != SE; ++SI) {
      if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
        isLiveOut = true;
        break;
      }
    }

    if (isLiveOut)
      continue;

    MachineOperand *LastUse = findLastUse(MBB, SrcReg);
    assert(LastUse);
    SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
    SrcLI.removeRange(LastUseIndex.getDefIndex(), LI->getMBBEndIdx(MBB));
    LastUse->setIsKill(true);
  }

  LI->renumber();

  Allocator.Reset();
  RegNodeMap.clear();
  PHISrcDefs.clear();
  InsertedSrcCopySet.clear();
  InsertedSrcCopyMap.clear();
  InsertedDestCopies.clear();

  return Changed;
}
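// Sketch of the union-find machinery behind addReg()/unionRegs()/getRegColor()
// (illustrative shape only; the real pass keeps Node objects with rank-based
// union in RegNodeMap). Each congruence class of PHI-related registers is
// identified by the register stored at its root.
namespace {
struct UFNode {
  UFNode *Parent; // self-pointer at the root
  unsigned Reg;   // virtual register this node stands for
  unsigned Rank;  // union-by-rank bound
};
} // end anonymous namespace

static UFNode *findLeader(UFNode *N) {
  // Path halving keeps the trees shallow.
  while (N->Parent != N) {
    N->Parent = N->Parent->Parent;
    N = N->Parent;
  }
  return N;
}

static void unionNodes(UFNode *A, UFNode *B) {
  A = findLeader(A);
  B = findLeader(B);
  if (A == B)
    return;
  if (A->Rank < B->Rank)
    std::swap(A, B);
  B->Parent = A;
  if (A->Rank == B->Rank)
    ++A->Rank;
}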
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm()) {
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction.
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc.
          assert(Src2->getReg() == AMDGPU::VCC &&
                 "Unexpected missing register operand");
          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}
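// Sketch of copyRegOperandAsImplicit(), assumed to clone a register operand
// while forcing the implicit flag, so the vcc read survives in the e32 form
// (assumed shape, shown here for context):
static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) {
  assert(!Orig.isImplicit());
  return MachineOperand::CreateReg(Orig.getReg(),
                                   Orig.isDef(),
                                   /*isImp=*/true,
                                   Orig.isKill(),
                                   Orig.isDead(),
                                   Orig.isUndef(),
                                   Orig.isEarlyClobber(),
                                   Orig.getSubReg());
}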
bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {

  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
               << "********** Function: "
               << MF.getName() << "\n");

#if 0
  // for now disable this, if we move NewValueJump before register
  // allocation we need this information.
  LiveVariables &LVs = getAnalysis<LiveVariables>();
#endif

  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
  QRI = static_cast<const HexagonRegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();

  if (!QRI->Subtarget.hasV4TOps() ||
      DisableNewValueJumps) {
    return false;
  }

  int nvjCount = DbgNVJCount;
  int nvjGenerated = 0;

  // Loop through all the bb's of the function
  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
       MBBb != MBBe; ++MBBb) {
    MachineBasicBlock* MBB = MBBb;

    DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n");
    DEBUG(MBB->dump());
    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
    bool foundJump    = false;
    bool foundCompare = false;
    bool invertPredicate = false;
    unsigned predReg = 0; // predicate reg of the jump.
    unsigned cmpReg1 = 0;
    int cmpOp2 = 0;
    bool MO1IsKill = false;
    bool MO2IsKill = false;
    MachineBasicBlock::iterator jmpPos;
    MachineBasicBlock::iterator cmpPos;
    MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
    MachineBasicBlock *jmpTarget = nullptr;
    bool afterRA = false;
    bool isSecondOpReg = false;
    bool isSecondOpNewified = false;
    // Traverse the basic block - bottom up
    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
         MII != E;) {
      MachineInstr *MI = --MII;
      if (MI->isDebugValue()) {
        continue;
      }

      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
        break;

      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");

      if (!foundJump &&
         (MI->getOpcode() == Hexagon::JMP_t ||
          MI->getOpcode() == Hexagon::JMP_f ||
          MI->getOpcode() == Hexagon::JMP_tnew_t ||
          MI->getOpcode() == Hexagon::JMP_tnew_nt ||
          MI->getOpcode() == Hexagon::JMP_fnew_t ||
          MI->getOpcode() == Hexagon::JMP_fnew_nt)) {
        // This is where you would insert your compare and
        // instr that feeds compare
        jmpPos = MII;
        jmpInstr = MI;
        predReg = MI->getOperand(0).getReg();
        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);

        // If the ifconverter had not messed up the kill flags of the
        // operands, the following check on the kill flag would suffice.
        // if(!jmpInstr->getOperand(0).isKill()) break;

        // This predicate register is live out of the BB.
        // This would only work if we could actually use live variable
        // analysis on physical regs - but LLVM does not provide LV
        // analysis on phys regs.
        //if(LVs.isLiveOut(predReg, *MBB)) break;

        // Get all the successors of this block - which will always
        // be 2. Check if the predicate register is live-in to those
        // successors. If yes, we cannot delete the predicate -
        // I am doing this only because LLVM does not provide LiveOut
        // at the BB level.
        bool predLive = false;
        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
             SIE = MBB->succ_end(); SI != SIE; ++SI) {
          MachineBasicBlock* succMBB = *SI;
          if (succMBB->isLiveIn(predReg)) {
            predLive = true;
          }
        }
        if (predLive)
          break;

        jmpTarget = MI->getOperand(1).getMBB();
        foundJump = true;
        if (MI->getOpcode() == Hexagon::JMP_f ||
            MI->getOpcode() == Hexagon::JMP_fnew_t ||
            MI->getOpcode() == Hexagon::JMP_fnew_nt) {
          invertPredicate = true;
        }
        continue;
      }

      // No new value jump if there is a barrier. A barrier has to be in its
      // own packet. A barrier has zero operands. We conservatively bail out
      // here if we see any instruction with zero operands.
      if (foundJump && MI->getNumOperands() == 0)
        break;

      if (foundJump &&
          !foundCompare &&
          MI->getOperand(0).isReg() &&
          MI->getOperand(0).getReg() == predReg) {

        // Not all compares can be new value compare. Arch Spec: 7.6.1.1
        if (QII->isNewValueJumpCandidate(MI)) {

          assert((MI->getDesc().isCompare()) &&
                 "Only compare instruction can be collapsed into New Value Jump");
          isSecondOpReg = MI->getOperand(2).isReg();

          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
                                        afterRA, jmpPos, MF))
            break;

          cmpInstr = MI;
          cmpPos = MII;
          foundCompare = true;

          // We need cmpReg1 and cmpOp2(imm or reg) while building
          // new value jump instruction.
          cmpReg1 = MI->getOperand(1).getReg();
          if (MI->getOperand(1).isKill())
            MO1IsKill = true;

          if (isSecondOpReg) {
            cmpOp2 = MI->getOperand(2).getReg();
            if (MI->getOperand(2).isKill())
              MO2IsKill = true;
          } else
            cmpOp2 = MI->getOperand(2).getImm();
          continue;
        }
      }

      if (foundCompare && foundJump) {

        // If "common" checks fail, bail out on this BB.
        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
          break;

        bool foundFeeder = false;
        MachineBasicBlock::iterator feederPos = MII;
        if (MI->getOperand(0).isReg() &&
            MI->getOperand(0).isDef() &&
           (MI->getOperand(0).getReg() == cmpReg1 ||
            (isSecondOpReg &&
             MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {

          unsigned feederReg = MI->getOperand(0).getReg();

          // First try to see if we can get the feeder from the first operand
          // of the compare. If we cannot, and if secondOpReg is true
          // (second operand of the compare is also a register), try that one.
          // TODO: Try to come up with some heuristic to figure out which
          // feeder would benefit.

          if (feederReg == cmpReg1) {
            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
              if (!isSecondOpReg)
                break;
              else
                continue;
            } else
              foundFeeder = true;
          }

          if (!foundFeeder &&
              isSecondOpReg &&
              feederReg == (unsigned) cmpOp2)
            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
              break;

          if (isSecondOpReg) {
            // In case of CMPLT, or CMPLTU, or EQ with the second register
            // to newify, swap the operands.
            if (cmpInstr->getOpcode() == Hexagon::C2_cmpeq &&
                feederReg == (unsigned) cmpOp2) {
              unsigned tmp = cmpReg1;
              bool tmpIsKill = MO1IsKill;
              cmpReg1 = cmpOp2;
              MO1IsKill = MO2IsKill;
              cmpOp2 = tmp;
              MO2IsKill = tmpIsKill;
            }

            // Now we have swapped the operands, all we need to check is,
            // if the second operand (after swap) is the feeder.
            // And if it is, make a note.
            if (feederReg == (unsigned)cmpOp2)
              isSecondOpNewified = true;
          }

          // Now that we are moving the feeder closer to the jump, make sure
          // we are respecting the kill values of the feeder's operands.
          bool updatedIsKill = false;
          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
            MachineOperand &MO = MI->getOperand(i);
            if (MO.isReg() && MO.isUse()) {
              unsigned feederReg = MO.getReg();
              for (MachineBasicBlock::iterator localII = feederPos,
                   end = jmpPos; localII != end; localII++) {
                MachineInstr *localMI = localII;
                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
                  MachineOperand &localMO = localMI->getOperand(j);
                  if (localMO.isReg() && localMO.isUse() &&
                      localMO.isKill() && feederReg == localMO.getReg()) {
                    // We found an intervening kill of a register the feeder
                    // uses. Since the feeder moves below that use, clear the
                    // kill flag there and mark the feeder's use as the kill
                    // instead.
                    localMO.setIsKill(false);
                    MO.setIsKill();
                    updatedIsKill = true;
                    break;
                  }
                }
                if (updatedIsKill) break;
              }
            }
            if (updatedIsKill) break;
          }

          MBB->splice(jmpPos, MI->getParent(), MI);
          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
          DebugLoc dl = MI->getDebugLoc();
          MachineInstr *NewMI;

          assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
                 "This compare is not a New Value Jump candidate.");
          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
                                               isSecondOpNewified,
                                               jmpTarget, MBPI);
          if (invertPredicate)
            opc = QII->getInvertedPredicatedOpcode(opc);

          if (isSecondOpReg)
            NewMI = BuildMI(*MBB, jmpPos, dl, QII->get(opc))
                        .addReg(cmpReg1, getKillRegState(MO1IsKill))
                        .addReg(cmpOp2, getKillRegState(MO2IsKill))
                        .addMBB(jmpTarget);

          else if ((cmpInstr->getOpcode() == Hexagon::C2_cmpeqi ||
                    cmpInstr->getOpcode() == Hexagon::C2_cmpgti) &&
                   cmpOp2 == -1)
            // Corresponding new-value compare jump instructions don't have the
            // operand for -1 immediate value.
            NewMI = BuildMI(*MBB, jmpPos, dl, QII->get(opc))
                        .addReg(cmpReg1, getKillRegState(MO1IsKill))
                        .addMBB(jmpTarget);

          else
            NewMI = BuildMI(*MBB, jmpPos, dl, QII->get(opc))
                        .addReg(cmpReg1, getKillRegState(MO1IsKill))
                        .addImm(cmpOp2)
                        .addMBB(jmpTarget);

          assert(NewMI && "New Value Jump Instruction Not created!");
          (void)NewMI;
          if (cmpInstr->getOperand(0).isReg() &&
              cmpInstr->getOperand(0).isKill())
            cmpInstr->getOperand(0).setIsKill(false);
          if (cmpInstr->getOperand(1).isReg() &&
              cmpInstr->getOperand(1).isKill())
            cmpInstr->getOperand(1).setIsKill(false);
          cmpInstr->eraseFromParent();
          jmpInstr->eraseFromParent();
          ++nvjGenerated;
          ++NumNVJGenerated;
          break;
        }
      }
    }
  }

  return true;
}
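// Illustrative shape of the transformation (assembly is approximate):
//   r1 = add(r2, #1)          ; feeder
//   p0 = cmp.eq(r1, r3)       ; compare feeding the jump
//   if (p0) jump target
// is collapsed into a compare-and-jump that reads the feeder's result in the
// same packet via the .new form:
//   r1 = add(r2, #1)
//   if (cmp.eq(r1.new, r3)) jump:t target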
bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
  QRI = static_cast<const HexagonRegisterInfo *>(
      MF.getTarget().getRegisterInfo());
  MRI = &MF.getRegInfo();

  DenseMap<unsigned, unsigned> PeepholeMap;
  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;

  if (DisableHexagonPeephole) return false;

  // Loop over all of the basic blocks.
  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
       MBBb != MBBe; ++MBBb) {
    MachineBasicBlock* MBB = MBBb;
    PeepholeMap.clear();
    PeepholeDoubleRegsMap.clear();

    // Traverse the basic block.
    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
         ++MII) {
      MachineInstr *MI = MII;
      // Look for sign extends:
      // %vreg170<def> = SXTW %vreg166
      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::SXTW) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src = MI->getOperand(1);
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        // Just handle virtual registers.
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Map the following:
          // %vreg170<def> = SXTW %vreg166
          // PeepholeMap[170] = vreg166
          PeepholeMap[DstReg] = SrcReg;
        }
      }

      // Look for %vreg170<def> = COMBINE_ir_V4 (0, %vreg169)
      // %vreg170:DoubleRegs, %vreg169:IntRegs
      if (!DisableOptExtTo64 &&
          MI->getOpcode() == Hexagon::COMBINE_Ir_V4) {
        assert (MI->getNumOperands() == 3);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src1 = MI->getOperand(1);
        MachineOperand &Src2 = MI->getOperand(2);
        if (Src1.getImm() != 0)
          continue;
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src2.getReg();
        PeepholeMap[DstReg] = SrcReg;
      }

      // Look for this sequence below
      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
      // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
      // and convert into
      // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
      if (MI->getOpcode() == Hexagon::LSRd_ri) {
        assert(MI->getNumOperands() == 3);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src1 = MI->getOperand(1);
        MachineOperand &Src2 = MI->getOperand(2);
        if (Src2.getImm() != 32)
          continue;
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src1.getReg();
        PeepholeDoubleRegsMap[DstReg] =
            std::make_pair(SrcReg, 1 /*Hexagon::subreg_hireg*/);
      }

      // Look for P=NOT(P).
      if (!DisablePNotP &&
          (MI->getOpcode() == Hexagon::NOT_p)) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src = MI->getOperand(1);
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        // Just handle virtual registers.
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Map the following:
          // %vreg170<def> = NOT_xx %vreg166
          // PeepholeMap[170] = vreg166
          PeepholeMap[DstReg] = SrcReg;
        }
      }

      // Look for copy:
      // %vreg176<def> = COPY %vreg170:subreg_loreg
      if (!DisableOptSZExt && MI->isCopy()) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src = MI->getOperand(1);

        // Make sure we are copying the lower 32 bits.
        if (Src.getSubReg() != Hexagon::subreg_loreg)
          continue;

        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Try to find in the map.
          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
            // Change the 1st operand.
            MI->RemoveOperand(1);
            MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
          } else {
            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
                PeepholeDoubleRegsMap.find(SrcReg);
            if (DI != PeepholeDoubleRegsMap.end()) {
              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
              MI->RemoveOperand(1);
              MI->addOperand(MachineOperand::CreateReg(
                  PeepholeSrc.first, false /*isDef*/, false /*isImp*/,
                  false /*isKill*/, false /*isDead*/, false /*isUndef*/,
                  false /*isEarlyClobber*/, PeepholeSrc.second));
            }
          }
        }
      }

      // Look for Predicated instructions.
      if (!DisablePNotP) {
        bool Done = false;
        if (QII->isPredicated(MI)) {
          MachineOperand &Op0 = MI->getOperand(0);
          unsigned Reg0 = Op0.getReg();
          const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
          if (RC0->getID() == Hexagon::PredRegsRegClassID) {
            // Handle instructions that have a predicate register in op0
            // (most cases of predicable instructions).
            if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
              // Try to find in the map.
              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
                // Change the 1st operand and flip the opcode.
                MI->getOperand(0).setReg(PeepholeSrc);
                int NewOp = QII->getInvertedPredicatedOpcode(MI->getOpcode());
                MI->setDesc(QII->get(NewOp));
                Done = true;
              }
            }
          }
        }

        if (!Done) {
          // Handle special instructions.
          unsigned Op = MI->getOpcode();
          unsigned NewOp = 0;
          unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.

          switch (Op) {
            case Hexagon::TFR_condset_rr:
            case Hexagon::TFR_condset_ii:
            case Hexagon::MUX_ii:
            case Hexagon::MUX_rr:
              NewOp = Op;
              break;
            case Hexagon::TFR_condset_ri:
              NewOp = Hexagon::TFR_condset_ir;
              break;
            case Hexagon::TFR_condset_ir:
              NewOp = Hexagon::TFR_condset_ri;
              break;
            case Hexagon::MUX_ri:
              NewOp = Hexagon::MUX_ir;
              break;
            case Hexagon::MUX_ir:
              NewOp = Hexagon::MUX_ri;
              break;
          }
          if (NewOp) {
            unsigned PSrc = MI->getOperand(PR).getReg();
            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
              MI->getOperand(PR).setReg(POrig);
              MI->setDesc(QII->get(NewOp));
              // Swap operands S1 and S2.
              MachineOperand Op1 = MI->getOperand(S1);
              MachineOperand Op2 = MI->getOperand(S2);
              ChangeOpInto(MI->getOperand(S1), Op2);
              ChangeOpInto(MI->getOperand(S2), Op1);
            }
          } // if (NewOp)
        } // if (!Done)

      } // if (!DisablePNotP)

    } // Instruction
  } // Basic Block
  return true;
}
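// Sketch of ChangeOpInto(), used above to swap the two source operands in
// place (assumed shape; only the operand kinds this pass actually produces
// are handled):
void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
  assert(&Dst != &Src && "Cannot duplicate into itself");
  switch (Dst.getType()) {
  case MachineOperand::MO_Register:
    if (Src.isReg())
      Dst.setReg(Src.getReg());
    else if (Src.isImm())
      Dst.ChangeToImmediate(Src.getImm());
    else
      llvm_unreachable("Unexpected src operand type");
    break;
  case MachineOperand::MO_Immediate:
    if (Src.isImm())
      Dst.setImm(Src.getImm());
    else if (Src.isReg())
      Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
                           Src.isKill(), Src.isDead(), Src.isUndef(),
                           Src.isDebug());
    else
      llvm_unreachable("Unexpected src operand type");
    break;
  default:
    llvm_unreachable("Unexpected operand type");
  }
}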
void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineFrameInfo *MFI = MF.getFrameInfo();
  const Function* Fn = MF.getFunction();
  const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
                          !Fn->doesNotThrow() ||
                          UnwindTablesMandatory;
  // Prepare for frame info.
  unsigned FrameLabelId = 0;

  // Get the number of bytes to allocate from the FrameInfo.
  uint64_t StackSize = MFI->getStackSize();

  // Get desired stack alignment.
  uint64_t MaxAlign = MFI->getMaxAlignment();

  // Add RETADDR move area to callee saved frame size.
  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
  if (TailCallReturnAddrDelta < 0)
    X86FI->setCalleeSavedFrameSize(
      X86FI->getCalleeSavedFrameSize() + (-TailCallReturnAddrDelta));

  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
  // function, and use up to 128 bytes of stack space, don't have a frame
  // pointer, calls, or dynamic alloca then we do not need to adjust the
  // stack pointer (we fit in the Red Zone).
  if (Is64Bit && !DisableRedZone &&
      !needsStackRealignment(MF) &&
      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
      !MFI->hasCalls()) {                          // No calls.
    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
    if (hasFP(MF)) MinSize += SlotSize;
    StackSize = std::max(MinSize,
                         StackSize > 128 ? StackSize - 128 : 0);
    MFI->setStackSize(StackSize);
  }

  // Insert stack pointer adjustment for later moving of return addr. Only
  // applies to tail call optimized functions where the callee argument stack
  // size is bigger than the caller's.
  if (TailCallReturnAddrDelta < 0) {
    MachineInstr *MI =
      BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
              StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
    // The EFLAGS implicit def is dead.
    MI->getOperand(3).setIsDead();
  }

  uint64_t NumBytes = 0;
  if (hasFP(MF)) {
    // Calculate required stack adjustment.
    uint64_t FrameSize = StackSize - SlotSize;
    if (needsStackRealignment(MF))
      FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;

    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();

    // Get the offset of the stack slot for the EBP register... which is
    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
    // Update the frame offset adjustment.
    MFI->setOffsetAdjustment(-NumBytes);

    // Save EBP into the appropriate stack slot...
    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
      .addReg(FramePtr, /*isDef=*/false, /*isImp=*/false, /*isKill=*/true);

    if (needsFrameMoves) {
      // Mark effective beginning of when frame pointer becomes valid.
      FrameLabelId = MMI->NextLabelID();
      BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
    }

    // Update EBP with the new base value...
    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
      .addReg(StackPtr);

    // Mark the FramePtr as live-in in every block except the entry.
    for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
         I != E; ++I)
      I->addLiveIn(FramePtr);

    // Realign stack.
    if (needsStackRealignment(MF)) {
      MachineInstr *MI =
        BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
                StackPtr).addReg(StackPtr).addImm(-MaxAlign);
      // The EFLAGS implicit def is dead.
      MI->getOperand(3).setIsDead();
    }
  } else
    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();

  unsigned ReadyLabelId = 0;
  if (needsFrameMoves) {
    // Mark effective beginning of when frame pointer is ready.
    ReadyLabelId = MMI->NextLabelID();
    BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
  }

  // Skip the callee-saved push instructions.
  while (MBBI != MBB.end() &&
         (MBBI->getOpcode() == X86::PUSH32r ||
          MBBI->getOpcode() == X86::PUSH64r))
    ++MBBI;

  if (NumBytes) {   // adjust stack pointer: ESP -= numbytes
    if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
      // Check whether EAX is live-in for this function.
      bool isEAXAlive = false;
      for (MachineRegisterInfo::livein_iterator
           II = MF.getRegInfo().livein_begin(),
           EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
        unsigned Reg = II->first;
        isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
                      Reg == X86::AH || Reg == X86::AL);
      }

      // Function prologue calls _alloca to probe the stack when allocating
      // more than 4k bytes in one go. Touching the stack at 4K increments is
      // necessary to ensure that the guard pages used by the OS virtual
      // memory manager are allocated in correct sequence.
      if (!isEAXAlive) {
        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes);
        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("_alloca");
      } else {
        // Save EAX.
        BuildMI(MBB, MBBI, TII.get(X86::PUSH32r))
          .addReg(X86::EAX, /*isDef=*/false, /*isImp=*/false, /*isKill=*/true);

        // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
        // allocated bytes for EAX.
        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX)
          .addImm(NumBytes - 4);
        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("_alloca");

        // Restore EAX.
        MachineInstr *MI = addRegOffset(BuildMI(MF, TII.get(X86::MOV32rm),
                                                X86::EAX),
                                        StackPtr, false, NumBytes - 4);
        MBB.insert(MBBI, MI);
      }
    } else {
      // If there is a SUB32ri of ESP immediately before this instruction,
      // merge the two. This can be the case when tail call elimination is
      // enabled and the callee has more arguments than the caller.
      NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);

      // If there is an ADD32ri or SUB32ri of ESP immediately after this
      // instruction, merge the two instructions.
      mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);

      if (NumBytes)
        emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
    }
  }

  if (needsFrameMoves)
    emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
}
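// Sketch of the emitSPUpdate() helper used above (assumed shape for this era
// of the code): pick the smallest ADD/SUB encoding for the adjustment and
// emit it against the stack pointer, chunking very large adjustments.
static void emitSPUpdate(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator &MBBI,
                         unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
                         const TargetInstrInfo &TII) {
  bool isSub = NumBytes < 0;
  uint64_t Offset = isSub ? -NumBytes : NumBytes;
  unsigned Opc = isSub
    ? ((Offset < 128) ?
       (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
       (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
    : ((Offset < 128) ?
       (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
       (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
  uint64_t Chunk = (1LL << 31) - 1;
  while (Offset) {
    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
    MachineInstr *MI = BuildMI(MBB, MBBI, TII.get(Opc), StackPtr)
                         .addReg(StackPtr).addImm(ThisVal);
    // The EFLAGS implicit def is dead.
    MI->getOperand(3).setIsDead();
    Offset -= ThisVal;
  }
}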
bool runOnMachineFunction(MachineFunction &MF) override {
  ST = &MF.getSubtarget<R600Subtarget>();
  MaxFetchInst = ST->getTexVTXClauseSize();
  TII = ST->getInstrInfo();
  TRI = ST->getRegisterInfo();

  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  CFStack CFStack(ST, MF.getFunction()->getCallingConv());
  for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
       ++MB) {
    MachineBasicBlock &MBB = *MB;
    unsigned CfCount = 0;
    std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
    std::vector<MachineInstr * > IfThenElseStack;
    if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
      BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
          getHWInstrDesc(CF_CALL_FS));
      CfCount++;
    }
    std::vector<ClauseFile> FetchClauses, AluClauses;
    std::vector<MachineInstr *> LastAlu(1);
    std::vector<MachineInstr *> ToPopAfter;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
        I != E;) {
      if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
        DEBUG(dbgs() << CfCount << ":"; I->dump(););
        FetchClauses.push_back(MakeFetchClause(MBB, I));
        CfCount++;
        LastAlu.back() = nullptr;
        continue;
      }

      MachineBasicBlock::iterator MI = I;
      if (MI->getOpcode() != AMDGPU::ENDIF)
        LastAlu.back() = nullptr;
      if (MI->getOpcode() == AMDGPU::CF_ALU)
        LastAlu.back() = &*MI;
      I++;
      bool RequiresWorkAround =
          CFStack.requiresWorkAroundForInst(MI->getOpcode());
      switch (MI->getOpcode()) {
      case AMDGPU::CF_ALU_PUSH_BEFORE:
        if (RequiresWorkAround) {
          DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
          BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              TII->get(AMDGPU::CF_PUSH_EG))
              .addImm(CfCount + 1)
              .addImm(1);
          MI->setDesc(TII->get(AMDGPU::CF_ALU));
          CfCount++;
          CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
        } else
          CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
        // FALLTHROUGH
      case AMDGPU::CF_ALU:
        I = MI;
        AluClauses.push_back(MakeALUClause(MBB, I));
        DEBUG(dbgs() << CfCount << ":"; MI->dump(););
        CfCount++;
        break;
      case AMDGPU::WHILELOOP: {
        CFStack.pushLoop();
        MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
            getHWInstrDesc(CF_WHILE_LOOP))
            .addImm(1);
        std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
            std::set<MachineInstr *>());
        Pair.second.insert(MIb);
        LoopStack.push_back(std::move(Pair));
        MI->eraseFromParent();
        CfCount++;
        break;
      }
      case AMDGPU::ENDLOOP: {
        CFStack.popLoop();
        std::pair<unsigned, std::set<MachineInstr *> > Pair =
            std::move(LoopStack.back());
        LoopStack.pop_back();
        CounterPropagateAddr(Pair.second, CfCount);
        BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
            .addImm(Pair.first + 1);
        MI->eraseFromParent();
        CfCount++;
        break;
      }
      case AMDGPU::IF_PREDICATE_SET: {
        LastAlu.push_back(nullptr);
        MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
            getHWInstrDesc(CF_JUMP))
            .addImm(0)
            .addImm(0);
        IfThenElseStack.push_back(MIb);
        DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
        MI->eraseFromParent();
        CfCount++;
        break;
      }
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      MachineInstr &MI = *I;
      if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
        DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
        DEBUG(MI.print(dbgs()));
        TII->moveToVALU(MI);
      }

      switch (MI.getOpcode()) {
      default: continue;
      case AMDGPU::PHI: {
        DEBUG(dbgs() << "Fixing PHI: " << MI);

        for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
          const MachineOperand &Op = MI.getOperand(i);
          unsigned Reg = Op.getReg();
          const TargetRegisterClass *RC =
              inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());

          MRI.constrainRegClass(Op.getReg(), RC);
        }
        unsigned Reg = MI.getOperand(0).getReg();
        const TargetRegisterClass *RC =
            inferRegClassFromUses(TRI, MRI, Reg, MI.getOperand(0).getSubReg());
        if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
          MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
        }

        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
          break;

        // If a PHI node defines an SGPR and any of its operands are VGPRs,
        // then we need to move it to the VALU.
        //
        // Also, if a PHI node defines an SGPR and has all SGPR operands
        // we must move it to the VALU, because the SGPR operands will
        // all end up being assigned the same register, which means
        // there is a potential for a conflict if different threads take
        // different control flow paths.
        //
        // For Example:
        //
        // sgpr0 = def;
        // ...
        // sgpr1 = def;
        // ...
        // sgpr2 = PHI sgpr0, sgpr1
        // use sgpr2;
        //
        // Will Become:
        //
        // sgpr2 = def;
        // ...
        // sgpr2 = def;
        // ...
        // use sgpr2
        //
        // FIXME: This is OK if the branching decision is made based on an
        // SGPR value.
        bool SGPRBranch = false;

        // The one exception to this rule is when one of the operands
        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
        // instruction. In this case, we know the program will
        // never enter the second block (the loop) without entering
        // the first block (where the condition is computed), so there
        // is no chance for values to be over-written.

        bool HasBreakDef = false;
        for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
          unsigned Reg = MI.getOperand(i).getReg();
          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
            TII->moveToVALU(MI);
            break;
          }
          MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
          assert(DefInstr);
          switch(DefInstr->getOpcode()) {

          case AMDGPU::SI_BREAK:
          case AMDGPU::SI_IF_BREAK:
          case AMDGPU::SI_ELSE_BREAK:
          // If we see a PHI instruction that defines an SGPR, then that PHI
          // instruction has already been considered and should have
          // a *_BREAK as an operand.
          case AMDGPU::PHI:
            HasBreakDef = true;
            break;
          }
        }

        if (!SGPRBranch && !HasBreakDef)
          TII->moveToVALU(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
            !hasVGPROperands(MI, TRI))
          continue;

        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI);
        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI);
        }
        break;
      }
      }
    }
  }

  return true;
}
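// Sketch of isVGPRToSGPRCopy(), the predicate driving the first rewrite above
// (assumed shape): a COPY is illegal for the scalar unit when its source
// class contains VGPRs while its destination class is an SGPR class.
static bool isVGPRToSGPRCopy(const MachineInstr &Copy,
                             const SIRegisterInfo *TRI,
                             const MachineRegisterInfo &MRI) {
  unsigned DstReg = Copy.getOperand(0).getReg();
  unsigned SrcReg = Copy.getOperand(1).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
      !TargetRegisterInfo::isVirtualRegister(SrcReg))
    return false;
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
  return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
}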
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
        unsigned Reg = MI.getOperand(0).getReg();
        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
        if (RC == &AMDGPU::VReg_1RegClass)
          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
        continue;
      }

      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      const MachineOperand &Dst = MI.getOperand(0);
      const MachineOperand &Src = MI.getOperand(1);

      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());

      if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
        I1Defs.push_back(Dst.getReg());
        DebugLoc DL = MI.getDebugLoc();

        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
          if (DefInst->getOperand(1).isImm()) {
            I1Defs.push_back(Dst.getReg());

            int64_t Val = DefInst->getOperand(1).getImm();
            assert(Val == 0 || Val == -1);

            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
              .addOperand(Dst)
              .addImm(Val);
            MI.eraseFromParent();
            continue;
          }
        }

        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
          .addOperand(Dst)
          .addImm(0)
          .addImm(-1)
          .addOperand(Src);
        MI.eraseFromParent();
      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                 SrcRC == &AMDGPU::VReg_1RegClass) {
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
          .addOperand(Dst)
          .addOperand(Src)
          .addImm(0);
        MI.eraseFromParent();
      }
    }
  }

  for (unsigned Reg : I1Defs)
    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);

  return false;
}
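// Illustrative effect of the two rewrites above (MIR is approximate):
//   %vreg1:vreg_1 = COPY %sreg:sreg_64        ; i1 "true in active lanes"
// becomes
//   %vreg1 = V_CNDMASK_B32_e64 0, -1, %sreg   ; per-lane select of 0/-1
// and the reverse copy
//   %sreg:sreg_64 = COPY %vreg1:vreg_1
// becomes
//   %sreg = V_CMP_NE_U32_e64 %vreg1, 0        ; rebuild the lane mask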
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; bool NeedWQM = false; bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; if (TII->isDS(MI.getOpcode())) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. if (TII->isFLAT(MI.getOpcode())) NeedFlat = true; switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: ++Depth; If(MI); break; case AMDGPU::SI_ELSE: Else(MI); break; case AMDGPU::SI_BREAK: Break(MI); break; case AMDGPU::SI_IF_BREAK: IfBreak(MI); break; case AMDGPU::SI_ELSE_BREAK: ElseBreak(MI); break; case AMDGPU::SI_LOOP: ++Depth; Loop(MI); break; case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { SkipIfDead(MI); HaveKill = false; } EndCf(MI); break; case AMDGPU::SI_KILL: if (Depth == 0) SkipIfDead(MI); else HaveKill = true; Kill(MI); break; case AMDGPU::S_BRANCH: Branch(MI); break; case AMDGPU::SI_INDIRECT_SRC: IndirectSrc(MI); break; case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; case AMDGPU::V_INTERP_P1_F32: case AMDGPU::V_INTERP_P2_F32: case AMDGPU::V_INTERP_MOV_F32: NeedWQM = true; break; } } } if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); } // FIXME: This seems inappropriate to do here. if (NeedFlat && MFI->IsKernel) { // Insert the prologue initializing the SGPRs pointing to the scratch space // for flat accesses. const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); // TODO: What to use with function calls? // FIXME: This is reporting stack size that is used in a scratch buffer // rather than registers as well. uint64_t StackSizeBytes = FrameInfo->getStackSize(); int IndirectBegin = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); // Convert register index to 256-byte unit. uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && "Stack limits should be smaller than 16-bits"); // Initialize the flat scratch register pair. // TODO: Can we use one s_mov_b64 here? // Offset is in units of 256-bytes. MachineBasicBlock &MBB = MF.front(); DebugLoc NoDL; MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) .addImm(StackOffset); // Documentation says size is "per-thread scratch size in bytes" BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) .addImm(StackSizeBytes); } return true; }
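// None of the control-flow handlers dispatched above are part of this
// excerpt. As one representative, SI_IF is conventionally expanded along
// these lines (a sketch, not this file's exact body; Skip() is the pass's
// branch-over-dead-lanes helper, also not shown):
void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();  // receives the saved exec mask
  unsigned Vcc = MI.getOperand(1).getReg();  // the branch condition

  // exec &= vcc, with the old exec captured in Reg.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  // Reg = exec ^ saved exec: exactly the lanes that take the else side.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));
  MI.eraseFromParent();
}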
bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) { raw_ostream *OutFile = 0; if (OutFileName) { std::string ErrorInfo; OutFile = new raw_fd_ostream(OutFileName, ErrorInfo, raw_fd_ostream::F_Append); if (!ErrorInfo.empty()) { errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n'; exit(1); } OS = OutFile; } else { OS = &errs(); } foundErrors = 0; this->MF = &MF; TM = &MF.getTarget(); TII = TM->getInstrInfo(); TRI = TM->getRegisterInfo(); MRI = &MF.getRegInfo(); LiveVars = NULL; LiveInts = NULL; LiveStks = NULL; Indexes = NULL; if (PASS) { LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>(); // We don't want to verify LiveVariables if LiveIntervals is available. if (!LiveInts) LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>(); LiveStks = PASS->getAnalysisIfAvailable<LiveStacks>(); Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>(); } visitMachineFunctionBefore(); for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end(); MFI!=MFE; ++MFI) { visitMachineBasicBlockBefore(MFI); for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(), MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) { if (MBBI->getParent() != MFI) { report("Bad instruction parent pointer", MFI); *OS << "Instruction: " << *MBBI; continue; } // Skip BUNDLE instruction for now. FIXME: We should add code to verify // the BUNDLE's specifically. if (MBBI->isBundle()) continue; visitMachineInstrBefore(MBBI); for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) visitMachineOperand(&MBBI->getOperand(I), I); visitMachineInstrAfter(MBBI); } visitMachineBasicBlockAfter(MFI); } visitMachineFunctionAfter(); if (OutFile) delete OutFile; else if (foundErrors) report_fatal_error("Found "+Twine(foundErrors)+" machine code errors."); // Clean up. regsLive.clear(); regsDefined.clear(); regsDead.clear(); regsKilled.clear(); regMasks.clear(); regsLiveInButUnused.clear(); MBBInfoMap.clear(); return false; // no changes }
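// The report() helper used above is not shown in this excerpt; a sketch of
// the block-level overload, assuming this shape (the real verifier also has
// function-, instruction- and operand-level overloads that print
// progressively more context):
void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
  assert(MF && MBB && "must have a function and block to report against");
  ++foundErrors;  // tallied and turned into a fatal error at the end
  *OS << "*** Bad machine code: " << msg << " ***\n"
      << "- function:    " << MF->getFunction()->getName() << '\n'
      << "- basic block: BB#" << MBB->getNumber() << '\n';
}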
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; MachineRegisterInfo &MRI = MF.getRegInfo(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); std::vector<unsigned> I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { // If this has a literal constant source that is the same as the // reversed bits of an inline immediate, replace with a bitreverse of // that constant. This saves 4 bytes in the common case of materializing // sign bits. // Test if we are after regalloc. We only want to do this after any // optimizations happen because this will confuse them. // XXX - not exactly a check for post-regalloc run. MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { int32_t ReverseImm; if (isReverseInlineImm(TII, Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); Src.setImm(ReverseImm); continue; } } } // Combine adjacent s_nops to use the immediate operand encoding how long // to wait. // // s_nop N // s_nop M // => // s_nop (N + M) if (MI.getOpcode() == AMDGPU::S_NOP && Next != MBB.end() && (*Next).getOpcode() == AMDGPU::S_NOP) { MachineInstr &NextMI = *Next; // The instruction encodes the amount to wait with an offset of 1, // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back // after adding. uint8_t Nop0 = MI.getOperand(0).getImm() + 1; uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; // Make sure we don't overflow the bounds. if (Nop0 + Nop1 <= 8) { NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); MI.eraseFromParent(); } continue; } // FIXME: We also need to consider movs of constant operands since // immediate operands are not folded if they have more than one use, and // the operand folding pass is unaware if the immediate will be free since // it won't know if the src == dest constraint will end up being // satisfied. if (MI.getOpcode() == AMDGPU::S_ADD_I32 || MI.getOpcode() == AMDGPU::S_MUL_I32) { const MachineOperand *Dest = &MI.getOperand(0); MachineOperand *Src0 = &MI.getOperand(1); MachineOperand *Src1 = &MI.getOperand(2); if (!Src0->isReg() && Src1->isReg()) { if (TII->commuteInstruction(MI, false, 1, 2)) std::swap(Src0, Src1); } // FIXME: This could work better if hints worked with subregisters. If // we have a vector add of a constant, we usually don't get the correct // allocation due to the subregister usage. if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && Src0->isReg()) { MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg()); MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg()); continue; } if (Src0->isReg() && Src0->getReg() == Dest->getReg()) { if (Src1->isImm() && isKImmOperand(TII, *Src1)) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; MI.setDesc(TII->get(Opc)); MI.tieOperands(0, 1); } } } // Try to use s_cmpk_* if (MI.isCompare() && TII->isSOPC(MI)) { shrinkScalarCompare(TII, MI); continue; } // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. 
if (MI.getOpcode() == AMDGPU::S_MOV_B32) { const MachineOperand &Dst = MI.getOperand(0); MachineOperand &Src = MI.getOperand(1); if (Src.isImm() && TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) { int32_t ReverseImm; if (isKImmOperand(TII, Src)) MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); else if (isReverseInlineImm(TII, Src, ReverseImm)) { MI.setDesc(TII->get(AMDGPU::S_BREV_B32)); Src.setImm(ReverseImm); } } continue; } if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; if (!canShrink(MI, TII, TRI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. if (!MI.isCommutable() || !TII->commuteInstruction(MI) || !canShrink(MI, TII, TRI, MRI)) continue; } // getVOPe32 could be -1 here if we started with an instruction that had // a 32-bit encoding and then commuted it to an instruction that did not. if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { // VOPC instructions can only write to the VCC register. We can't // force them to use VCC here, because this is only one register and // cannot deal with sequences which would require multiple copies of // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we will run // this pass again after RA and shrink it if it outputs to VCC. MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); continue; } if (DstReg != AMDGPU::VCC) continue; } if (Op32 == AMDGPU::V_CNDMASK_B32_e32) { // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC // instructions. const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (!Src2->isReg()) continue; unsigned SReg = Src2->getReg(); if (TargetRegisterInfo::isVirtualRegister(SReg)) { MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC); continue; } if (SReg != AMDGPU::VCC) continue; } // Check for the bool flag output for instructions like V_ADD_I32_e64. const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); // Check the carry-in operand for v_addc_u32_e64. const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (SDst) { if (SDst->getReg() != AMDGPU::VCC) { if (TargetRegisterInfo::isVirtualRegister(SDst->getReg())) MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC); continue; } // All of the instructions with carry outs also have an SGPR input in // src2. if (Src2 && Src2->getReg() != AMDGPU::VCC) { if (TargetRegisterInfo::isVirtualRegister(Src2->getReg())) MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC); continue; } } // We can shrink this instruction DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. 
int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); if (Op32DstIdx != -1) { // dst Inst32.add(MI.getOperand(0)); } else { assert(MI.getOperand(0).getReg() == AMDGPU::VCC && "Unexpected case"); } Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (Src1) Inst32.add(*Src1); if (Src2) { int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); if (Op32Src2Idx != -1) { Inst32.add(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is // replaced with an implicit read of vcc. This was already added // during the initial BuildMI, so find it to preserve the flags. copyFlagsToImplicitVCC(*Inst32, *Src2); } } ++NumInstructionsShrunk; // Copy extra operands not present in the instruction definition. copyExtraImplicitOps(*Inst32, MF, MI); MI.eraseFromParent(); foldImmediates(*Inst32, TII, MRI); DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } } return false; }
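// The two immediate predicates driving the S_MOVK_I32 / S_BREV_B32 and
// V_BFREV_B32 rewrites above (both called but not defined in this excerpt).
// Sketches of the obvious implementations, assuming llvm::reverseBits from
// MathExtras and the SI inline-constant window of -16..64:
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  // Must fit the 16-bit immediate field of the K-form, and must not already
  // be free as an inline constant (then shrinking would gain nothing).
  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
}

static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4))
    return false;  // not a 32-bit literal, or already free as-is
  // If the bit-reversed value is an inline constant, encoding a bitreverse
  // of that constant is 4 bytes shorter than carrying a 32-bit literal.
  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}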
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(); while (I != MBB.end()) { MachineInstr &MI = *I; I = llvm::next(I); switch (MI.getOpcode()) { default: break; // Expand PRED_X to one of the PRED_SET instructions. case AMDGPU::PRED_X: { uint64_t Flags = MI.getOperand(3).getImm(); // The native opcode used by PRED_X is stored as an immediate in the // third operand. MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, MI.getOperand(2).getImm(), // opcode MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 AMDGPU::ZERO); // src1 TII->addFlag(PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); } else { TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1); } MI.eraseFromParent(); continue; } case AMDGPU::BREAK: { MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, AMDGPU::PRED_SETE_INT, AMDGPU::PREDICATE_BIT, AMDGPU::ZERO, AMDGPU::ZERO); TII->addFlag(PredSet, 0, MO_FLAG_MASK); TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::PREDICATED_BREAK)) .addReg(AMDGPU::PREDICATE_BIT); MI.eraseFromParent(); continue; } case AMDGPU::INTERP_PAIR_XY: { MachineInstr *BMI; unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( MI.getOperand(2).getImm()); for (unsigned Chan = 0; Chan < 4; ++Chan) { unsigned DstReg; if (Chan < 2) DstReg = MI.getOperand(Chan).getReg(); else DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W; BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); if (Chan > 0) { BMI->bundleWithPred(); } if (Chan >= 2) TII->addFlag(BMI, 0, MO_FLAG_MASK); if (Chan != 3) TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); continue; } case AMDGPU::INTERP_PAIR_ZW: { MachineInstr *BMI; unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( MI.getOperand(2).getImm()); for (unsigned Chan = 0; Chan < 4; ++Chan) { unsigned DstReg; if (Chan < 2) DstReg = Chan == 0 ? 
AMDGPU::T0_X : AMDGPU::T0_Y; else DstReg = MI.getOperand(Chan-2).getReg(); BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); if (Chan > 0) { BMI->bundleWithPred(); } if (Chan < 2) TII->addFlag(BMI, 0, MO_FLAG_MASK); if (Chan != 3) TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); continue; } case AMDGPU::INTERP_VEC_LOAD: { const R600RegisterInfo &TRI = TII->getRegisterInfo(); MachineInstr *BMI; unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( MI.getOperand(1).getImm()); unsigned DstReg = MI.getOperand(0).getReg(); for (unsigned Chan = 0; Chan < 4; ++Chan) { BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); if (Chan > 0) { BMI->bundleWithPred(); } if (Chan != 3) TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); continue; } } bool IsReduction = TII->isReductionOp(MI.getOpcode()); bool IsVector = TII->isVector(MI); bool IsCube = TII->isCubeOp(MI.getOpcode()); if (!IsReduction && !IsVector && !IsCube) { continue; } // Expand the instruction // // Reduction instructions: // T0_X = DP4 T1_XYZW, T2_XYZW // becomes: // TO_X = DP4 T1_X, T2_X // TO_Y (write masked) = DP4 T1_Y, T2_Y // TO_Z (write masked) = DP4 T1_Z, T2_Z // TO_W (write masked) = DP4 T1_W, T2_W // // Vector instructions: // T0_X = MULLO_INT T1_X, T2_X // becomes: // T0_X = MULLO_INT T1_X, T2_X // T0_Y (write masked) = MULLO_INT T1_X, T2_X // T0_Z (write masked) = MULLO_INT T1_X, T2_X // T0_W (write masked) = MULLO_INT T1_X, T2_X // // Cube instructions: // T0_XYZW = CUBE T1_XYZW // becomes: // TO_X = CUBE T1_Z, T1_Y // T0_Y = CUBE T1_Z, T1_X // T0_Z = CUBE T1_X, T1_Z // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { unsigned DstReg = MI.getOperand( TII->getOperandIdx(MI, R600Operands::DST)).getReg(); unsigned Src0 = MI.getOperand( TII->getOperandIdx(MI, R600Operands::SRC0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers if (!IsCube) { int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1); if (Src1Idx != -1) { Src1 = MI.getOperand(Src1Idx).getReg(); } } if (IsReduction) { unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); Src0 = TRI.getSubReg(Src0, SubRegIndex); Src1 = TRI.getSubReg(Src1, SubRegIndex); } else if (IsCube) { static const int CubeSrcSwz[] = {2, 2, 0, 1}; unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); Src1 = TRI.getSubReg(Src0, SubRegIndex1); Src0 = TRI.getSubReg(Src0, SubRegIndex0); } // Determine the correct destination registers; bool Mask = false; bool NotLast = true; if (IsCube) { unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); DstReg = TRI.getSubReg(DstReg, SubRegIndex); } else { // Mask the write if the original instruction does not write to // the current Channel. 
Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); } // Set the IsLast bit NotLast = (Chan != 3 ); // Add the new instruction unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AMDGPU::CUBE_r600_pseudo: Opcode = AMDGPU::CUBE_r600_real; break; case AMDGPU::CUBE_eg_pseudo: Opcode = AMDGPU::CUBE_eg_real; break; case AMDGPU::DOT4_r600_pseudo: Opcode = AMDGPU::DOT4_r600_real; break; case AMDGPU::DOT4_eg_pseudo: Opcode = AMDGPU::DOT4_eg_real; break; default: break; } MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); if (Chan != 0) NewMI->bundleWithPred(); if (Mask) { TII->addFlag(NewMI, 0, MO_FLAG_MASK); } if (NotLast) { TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); } } MI.eraseFromParent(); } } return false; }
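// The destination arithmetic above, restated as a helper for clarity (a
// sketch; HW_REG_MASK and the four-channels-per-T-register encoding are
// taken from the surrounding code):
static unsigned selectChannelReg(const R600RegisterInfo &TRI,
                                 unsigned DstReg, unsigned Chan) {
  // Strip the channel bits to get the T-register index, then select the
  // channel being expanded: expanding channel 2 of T5_Y yields T5_Z.
  unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
  return AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
}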
/// runOnMachineFunction - Reduce two-address instructions to two operands. /// bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { DEBUG(errs() << "Machine Function\n"); const TargetMachine &TM = MF.getTarget(); MRI = &MF.getRegInfo(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); LV = getAnalysisIfAvailable<LiveVariables>(); AA = &getAnalysis<AliasAnalysis>(); bool MadeChange = false; DEBUG(errs() << "********** REWRITING TWO-ADDR INSTRS **********\n"); DEBUG(errs() << "********** Function: " << MF.getFunction()->getName() << '\n'); // ReMatRegs - Keep track of the registers whose def's are remat'ed. BitVector ReMatRegs; ReMatRegs.resize(MRI->getLastVirtReg()+1); typedef DenseMap<unsigned, SmallVector<std::pair<unsigned, unsigned>, 4> > TiedOperandMap; TiedOperandMap TiedOperands(4); SmallPtrSet<MachineInstr*, 8> Processed; for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end(); mbbi != mbbe; ++mbbi) { unsigned Dist = 0; DistanceMap.clear(); SrcRegMap.clear(); DstRegMap.clear(); Processed.clear(); for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end(); mi != me; ) { MachineBasicBlock::iterator nmi = next(mi); const TargetInstrDesc &TID = mi->getDesc(); bool FirstTied = true; DistanceMap.insert(std::make_pair(mi, ++Dist)); ProcessCopy(&*mi, &*mbbi, Processed); // First scan through all the tied register uses in this instruction // and record a list of pairs of tied operands for each register. unsigned NumOps = (mi->getOpcode() == TargetInstrInfo::INLINEASM) ? mi->getNumOperands() : TID.getNumOperands(); for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) { unsigned DstIdx = 0; if (!mi->isRegTiedToDefOperand(SrcIdx, &DstIdx)) continue; if (FirstTied) { FirstTied = false; ++NumTwoAddressInstrs; DEBUG(errs() << '\t' << *mi); } assert(mi->getOperand(SrcIdx).isReg() && mi->getOperand(SrcIdx).getReg() && mi->getOperand(SrcIdx).isUse() && "two address instruction invalid"); unsigned regB = mi->getOperand(SrcIdx).getReg(); TiedOperandMap::iterator OI = TiedOperands.find(regB); if (OI == TiedOperands.end()) { SmallVector<std::pair<unsigned, unsigned>, 4> TiedPair; OI = TiedOperands.insert(std::make_pair(regB, TiedPair)).first; } OI->second.push_back(std::make_pair(SrcIdx, DstIdx)); } // Now iterate over the information collected above. for (TiedOperandMap::iterator OI = TiedOperands.begin(), OE = TiedOperands.end(); OI != OE; ++OI) { SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second; // If the instruction has a single pair of tied operands, try some // transformations that may either eliminate the tied operands or // improve the opportunities for coalescing away the register copy. if (TiedOperands.size() == 1 && TiedPairs.size() == 1) { unsigned SrcIdx = TiedPairs[0].first; unsigned DstIdx = TiedPairs[0].second; // If the registers are already equal, nothing needs to be done. if (mi->getOperand(SrcIdx).getReg() == mi->getOperand(DstIdx).getReg()) break; // Done with this instruction. if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist)) break; // The tied operands have been eliminated. } bool RemovedKillFlag = false; bool AllUsesCopied = true; unsigned LastCopiedReg = 0; unsigned regB = OI->first; for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { unsigned SrcIdx = TiedPairs[tpi].first; unsigned DstIdx = TiedPairs[tpi].second; unsigned regA = mi->getOperand(DstIdx).getReg(); // Grab regB from the instruction because it may have changed if the // instruction was commuted. 
regB = mi->getOperand(SrcIdx).getReg(); if (regA == regB) { // The register is tied to multiple destinations (or else we would // not have continued this far), but this use of the register // already matches the tied destination. Leave it. AllUsesCopied = false; continue; } LastCopiedReg = regA; assert(TargetRegisterInfo::isVirtualRegister(regB) && "cannot make instruction into two-address form"); #ifndef NDEBUG // First, verify that we don't have a use of "a" in the instruction // (a = b + a for example) because our transformation will not // work. This should never occur because we are in SSA form. for (unsigned i = 0; i != mi->getNumOperands(); ++i) assert(i == DstIdx || !mi->getOperand(i).isReg() || mi->getOperand(i).getReg() != regA); #endif // Emit a copy or rematerialize the definition. const TargetRegisterClass *rc = MRI->getRegClass(regB); MachineInstr *DefMI = MRI->getVRegDef(regB); // If it's safe and profitable, remat the definition instead of // copying it. if (DefMI && DefMI->getDesc().isAsCheapAsAMove() && DefMI->isSafeToReMat(TII, regB, AA) && isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){ DEBUG(errs() << "2addr: REMATTING : " << *DefMI << "\n"); unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg(); TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, TRI); ReMatRegs.set(regB); ++NumReMats; } else { bool Emitted = TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc); (void)Emitted; assert(Emitted && "Unable to issue a copy instruction!\n"); } MachineBasicBlock::iterator prevMI = prior(mi); // Update DistanceMap. DistanceMap.insert(std::make_pair(prevMI, Dist)); DistanceMap[mi] = ++Dist; DEBUG(errs() << "\t\tprepend:\t" << *prevMI); MachineOperand &MO = mi->getOperand(SrcIdx); assert(MO.isReg() && MO.getReg() == regB && MO.isUse() && "inconsistent operand info for 2-reg pass"); if (MO.isKill()) { MO.setIsKill(false); RemovedKillFlag = true; } MO.setReg(regA); } if (AllUsesCopied) { // Replace other (un-tied) uses of regB with LastCopiedReg. for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { MachineOperand &MO = mi->getOperand(i); if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { if (MO.isKill()) { MO.setIsKill(false); RemovedKillFlag = true; } MO.setReg(LastCopiedReg); } } // Update live variables for regB. if (RemovedKillFlag && LV && LV->getVarInfo(regB).removeKill(mi)) LV->addVirtualRegisterKilled(regB, prior(mi)); } else if (RemovedKillFlag) { // Some tied uses of regB matched their destination registers, so // regB is still used in this instruction, but a kill flag was // removed from a different tied use of regB, so now we need to add // a kill flag to one of the remaining uses of regB. for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { MachineOperand &MO = mi->getOperand(i); if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { MO.setIsKill(true); break; } } } MadeChange = true; DEBUG(errs() << "\t\trewrite to:\t" << *mi); } // Clear TiedOperands here instead of at the top of the loop // since most instructions do not have tied operands. TiedOperands.clear(); mi = nmi; } } // Some remat'ed instructions are dead. int VReg = ReMatRegs.find_first(); while (VReg != -1) { if (MRI->use_empty(VReg)) { MachineInstr *DefMI = MRI->getVRegDef(VReg); DefMI->eraseFromParent(); } VReg = ReMatRegs.find_next(VReg); } return MadeChange; }
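// An illustration of the rewrite the loop above performs (virtual register
// numbers hypothetical). A tied-operand instruction such as
//
//   %reg1027 = ADD %reg1025, %reg1026    ; dst tied to the first source
//
// is brought into two-address form by prepending a copy (or a
// rematerialized def, when that is safe and profitable) and retargeting the
// tied use at the destination register:
//
//   %reg1027 = COPY %reg1025             ; emitted via copyRegToReg
//   %reg1027 = ADD %reg1027, %reg1026    ; tied use now equals the def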
void MCS51FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB MachineFrameInfo *MFI = MF.getFrameInfo(); MCS51MachineFunctionInfo *MCS51FI = MF.getInfo<MCS51MachineFunctionInfo>(); const MCS51InstrInfo &TII = *static_cast<const MCS51InstrInfo*>(MF.getTarget().getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); uint64_t NumBytes = 0; if (hasFP(MF)) { // Calculate required stack adjustment uint64_t FrameSize = StackSize - 2; NumBytes = FrameSize - MCS51FI->getCalleeSavedFrameSize(); // Get the offset of the stack slot for the EBP register... which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. MFI->setOffsetAdjustment(-NumBytes); // Save FPW into the appropriate stack slot... BuildMI(MBB, MBBI, DL, TII.get(MCS51::PUSH16r)) .addReg(MCS51::FPW, RegState::Kill); // Update FPW with the new base value... BuildMI(MBB, MBBI, DL, TII.get(MCS51::MOV16rr), MCS51::FPW) .addReg(MCS51::SPW); // Mark the FramePtr as live-in in every block except the entry. for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); I != E; ++I) I->addLiveIn(MCS51::FPW); } else NumBytes = StackSize - MCS51FI->getCalleeSavedFrameSize(); // Skip the callee-saved push instructions. while (MBBI != MBB.end() && (MBBI->getOpcode() == MCS51::PUSH16r)) ++MBBI; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); if (NumBytes) { // adjust stack pointer: SPW -= numbytes // If there is an SUB16ri of SPW immediately before this instruction, merge // the two. //NumBytes -= mergeSPUpdates(MBB, MBBI, true); // If there is an ADD16ri or SUB16ri of SPW immediately after this // instruction, merge the two instructions. // mergeSPUpdatesDown(MBB, MBBI, &NumBytes); if (NumBytes) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(MCS51::SUB16ri), MCS51::SPW) .addReg(MCS51::SPW).addImm(NumBytes); // The SRW implicit def is dead. MI->getOperand(3).setIsDead(); } } }
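// A worked example of the frame-pointer-case arithmetic above (numbers
// hypothetical): with StackSize = 12 and a 4-byte callee-saved area,
// FrameSize = 12 - 2 = 10 (the two bytes account for the pushed frame
// pointer itself), NumBytes = 10 - 4 = 6, the recorded offset adjustment is
// -6, and SPW is ultimately decremented by 6 via SUB16ri.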
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock *EmptyMBBAtEnd = NULL; MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; bool ExecModified = false; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; // Flat uses m0 in case it needs to access LDS. if (TII->isFLAT(MI)) NeedFlat = true; for (const auto &Def : I->defs()) { if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) { ExecModified = true; break; } } switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: ++Depth; If(MI); break; case AMDGPU::SI_ELSE: Else(MI, ExecModified); break; case AMDGPU::SI_BREAK: Break(MI); break; case AMDGPU::SI_IF_BREAK: IfBreak(MI); break; case AMDGPU::SI_ELSE_BREAK: ElseBreak(MI); break; case AMDGPU::SI_LOOP: ++Depth; Loop(MI); break; case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { SkipIfDead(MI); HaveKill = false; } EndCf(MI); break; case AMDGPU::SI_KILL: if (Depth == 0) SkipIfDead(MI); else HaveKill = true; Kill(MI); break; case AMDGPU::S_BRANCH: Branch(MI); break; case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: IndirectDst(MI); break; case AMDGPU::S_ENDPGM: { if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()) break; // Graphics shaders returning non-void shouldn't contain S_ENDPGM, // because external bytecode will be appended at the end. if (BI != --MF.end() || I != MBB.getFirstTerminator()) { // S_ENDPGM is not the last instruction. Add an empty block at // the end and jump there. if (!EmptyMBBAtEnd) { EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); MF.insert(MF.end(), EmptyMBBAtEnd); } MBB.addSuccessor(EmptyMBBAtEnd); BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) .addMBB(EmptyMBBAtEnd); } I->eraseFromParent(); break; } } } } if (NeedFlat && MFI->IsKernel) { // TODO: What to use with function calls? // We will need to Initialize the flat scratch register pair. if (NeedFlat) MFI->setHasFlatInstructions(true); } return true; }
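// The EXEC scan above, restated as a standalone predicate (a sketch; the
// loop open-codes the same test over I->defs()):
static bool modifiesExec(const MachineInstr &MI) {
  for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I)
    if (I->isReg() && I->isDef() && I->getReg() == AMDGPU::EXEC)
      return true;
  return false;
}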
/// MoveDiscontiguousLoopBlocks - Move any loop blocks that are not in the /// portion of the loop contiguous with the header. This usually makes the loop /// contiguous, provided that AnalyzeBranch can handle all the relevant /// branching. See the @cfg_islands case in test/CodeGen/X86/loop_blocks.ll /// for an example of this. bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF, MachineLoop *L) { bool Changed = false; MachineBasicBlock *TopMBB = L->getTopBlock(); MachineBasicBlock *BotMBB = L->getBottomBlock(); // Determine a position to move orphaned loop blocks to. If TopMBB is not // entered via fallthrough and BotMBB is exited via fallthrough, prepend them // to the top of the loop to avoid losing that fallthrough. Otherwise append // them to the bottom, even if it previously had a fallthrough, on the theory // that it's worth an extra branch to keep the loop contiguous. MachineFunction::iterator InsertPt = llvm::next(MachineFunction::iterator(BotMBB)); bool InsertAtTop = false; if (TopMBB != MF.begin() && !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) && HasFallthrough(BotMBB)) { InsertPt = TopMBB; InsertAtTop = true; } // Keep a record of which blocks are in the portion of the loop contiguous // with the loop header. SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks; for (MachineFunction::iterator I = TopMBB, E = llvm::next(MachineFunction::iterator(BotMBB)); I != E; ++I) ContiguousBlocks.insert(I); // Find non-contiguous blocks and fix them. if (InsertPt != MF.begin() && HasAnalyzableTerminator(prior(InsertPt))) for (MachineLoop::block_iterator BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) { MachineBasicBlock *BB = *BI; // Verify that we can analyze all the loop entry edges before beginning // any changes which will require us to be able to analyze them. if (!HasAnalyzableTerminator(BB)) continue; if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(BB)))) continue; // If the layout predecessor is part of the loop, this block will be // processed along with it. This keeps them in their relative order. if (BB != MF.begin() && L->contains(prior(MachineFunction::iterator(BB)))) continue; // Check to see if this block is already contiguous with the main // portion of the loop. if (!ContiguousBlocks.insert(BB)) continue; // Move the block. DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << BB->getNumber() << " to be contiguous with loop.\n"); Changed = true; // Process this block and all loop blocks contiguous with it, to keep // them in their relative order. MachineFunction::iterator Begin = BB; MachineFunction::iterator End = llvm::next(MachineFunction::iterator(BB)); for (; End != MF.end(); ++End) { if (!L->contains(End)) break; if (!HasAnalyzableTerminator(End)) break; ContiguousBlocks.insert(End); ++NumIntraMoved; } // If we're inserting at the bottom of the loop, and the code we're // moving originally had fall-through successors, bring the successors // up with the loop blocks to preserve the fall-through edges. if (!InsertAtTop) for (; End != MF.end(); ++End) { if (L->contains(End)) break; if (!HasAnalyzableTerminator(End)) break; if (!HasFallthrough(prior(End))) break; } // Move the blocks. This may invalidate TopMBB and/or BotMBB, but // we don't need them anymore at this point. Splice(MF, InsertPt, Begin, End); } return Changed; }
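// The fallthrough test used throughout the function above is not part of
// this excerpt; a sketch in terms of AnalyzeBranch, the way the rest of the
// pass queries terminators:
bool CodePlacementOpt::HasFallthrough(MachineBasicBlock *MBB) {
  MachineBasicBlock *TBB = 0, *FBB = 0;
  SmallVector<MachineOperand, 4> Cond;
  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond))
    return false;  // unanalyzable terminator: assume no fallthrough
  if (FBB)
    return false;  // both destinations explicit: no fallthrough
  if (Cond.empty() && TBB)
    return false;  // unconditional branch never falls through
  return true;
}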
bool GCMachineCodeFixup::runOnMachineFunction(MachineFunction &MF) { // Quick exit for functions that do not use GC. if (!MF.getFunction()->hasGC()) return false; const TargetMachine &TM = MF.getTarget(); const TargetInstrInfo *TII = TM.getInstrInfo(); GCModuleInfo &GMI = getAnalysis<GCModuleInfo>(); GCFunctionInfo &GCFI = GMI.getFunctionInfo(*MF.getFunction()); for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); MBBI != MBBE; ++MBBI) { for (MachineBasicBlock::iterator MII = MBBI->begin(), MIE = MBBI->end(); MII != MIE;) { if (!MII->isGCRegRoot() || !MII->getOperand(0).isReg()) { ++MII; continue; } // Trace the register back to its location at the site of the call (either // a physical reg or a frame index). bool TracingReg = true; unsigned TracedReg = MII->getOperand(0).getReg(); int FrameIndex; MachineBasicBlock::iterator PrevII = MII; for (--PrevII;; --PrevII) { if (PrevII->isGCRegRoot() && PrevII->getOperand(0).isReg()) break; if (PrevII->isCall()) break; int FI; // Trace back through register reloads. unsigned Reg = TM.getInstrInfo()->isLoadFromStackSlotPostFE(&*PrevII, FI); if (Reg) { // This is a reload. If we're tracing this register, start tracing the // frame index instead. if (TracingReg && TracedReg == Reg) { TracingReg = false; FrameIndex = FI; } continue; } // Trace back through spills. if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&*PrevII, FI)) continue; // Trace back through register-to-register copies. if (PrevII->isCopy()) { if (TracingReg && TracedReg == PrevII->getOperand(0).getReg()) TracedReg = PrevII->getOperand(1).getReg(); continue; } // Trace back through non-register GC_REG_ROOT instructions. if (PrevII->isGCRegRoot() && !PrevII->getOperand(0).isReg()) continue; DEBUG(dbgs() << "Bad instruction: " << *PrevII); llvm_unreachable("GC_REG_ROOT found in an unexpected location!"); } // Now we've reached either a call or another GC_REG_ROOT instruction. // Move the GC_REG_ROOT instruction we're considering to the right place, // and rewrite it if necessary. // // Also, tell the GCFunctionInfo about the frame index, since this is // our only chance -- the frame indices will be deleted by the time // GCMachineCodeAnalysis runs. ++PrevII; unsigned RootIndex = MII->getOperand(1).getImm(); MachineInstr *NewMI; if (TracingReg) { MachineInstrBuilder MIB = BuildMI(MF, MII->getDebugLoc(), TII->get(TargetOpcode::GC_REG_ROOT)); MIB.addReg(TracedReg).addImm(RootIndex); NewMI = MIB; } else { NewMI = TII->emitFrameIndexGCRegRoot(MF, FrameIndex, RootIndex, MII->getDebugLoc()); GCFI.spillRegRoot(RootIndex, FrameIndex); } MBBI->insert(PrevII, NewMI); MachineBasicBlock::iterator NextII = MII; ++NextII; MII->eraseFromParent(); MII = NextII; } } return true; }
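// An illustration of the backward trace above (register numbers and frame
// index hypothetical). Starting from a register root, the walk retargets
// itself through copies and reloads until it reaches the call:
//
//   CALL @foo
//   %r1 = <reload from fi#2>     ; isLoadFromStackSlotPostFE: trace fi#2
//   %r2 = COPY %r1               ; copy: trace %r1 instead of %r2
//   GC_REG_ROOT %r2, 0           ; starting point of the trace
//
// The root is then re-emitted next to the call against fi#2, and the spill
// is reported to GCFunctionInfo before frame indices disappear.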
bool PatmosSPMark::runOnMachineModule(const Module &M) { DEBUG( dbgs() << "[Single-Path] Mark functions reachable from single-path roots\n"); MMI = &getAnalysis<MachineModuleInfo>(); assert(MMI); Worklist W; // initialize the worklist with machine functions that have either // sp-root or sp-reachable function attribute for(Module::const_iterator F(M.begin()), FE(M.end()); F != FE; ++F) { if (F->hasFnAttribute("sp-root") || F->hasFnAttribute("sp-reachable")) { // get the machine-level function MachineFunction *MF = MMI->getMachineFunction(F); assert( MF ); PatmosMachineFunctionInfo *PMFI = MF->getInfo<PatmosMachineFunctionInfo>(); PMFI->setSinglePath(); NumSPTotal++; // bump STATISTIC W.push_back(MF); } } // process worklist while (!W.empty()) { MachineFunction *MF = W.front(); W.pop_front(); scanAndRewriteCalls(MF, W); } // clear all cloned machine functions that are not marked as single-path // by now for(Module::const_iterator F(M.begin()), FE(M.end()); F != FE; ++F) { if (F->hasFnAttribute("sp-maybe")) { // get the machine-level function MachineFunction *MF = MMI->getMachineFunction(F); assert( MF ); PatmosMachineFunctionInfo *PMFI = MF->getInfo<PatmosMachineFunctionInfo>(); if (!PMFI->isSinglePath()) { // delete all MBBs while (MF->begin() != MF->end()) { MF->begin()->eraseFromParent(); } // insert a new single MBB with a single return instruction MachineBasicBlock *EmptyMBB = MF->CreateMachineBasicBlock(); MF->push_back(EmptyMBB); DebugLoc DL; AddDefaultPred(BuildMI(*EmptyMBB, EmptyMBB->end(), DL, TM.getInstrInfo()->get(Patmos::RET))); NumSPCleared++; // bump STATISTIC }; } } return true; }
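// The Worklist type is declared elsewhere in this pass; any FIFO of machine
// functions fits the front/pop_front/push_back usage above, e.g. (an
// assumed shape, not the pass's actual typedef):
typedef std::list<MachineFunction *> Worklist;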
bool VirtRegReduction::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; #if VRRPROF const Function *F = MF.getFunction(); std::string FN = F->getName().str(); llog("starting vrr... %s (%d)\n", FN.c_str(), (int)time(NULL)); llog("starting immRegs finder... (%d)\n", (int)time(NULL)); #endif std::auto_ptr<std::unordered_set<unsigned> > immRegsHolder; std::unordered_set<unsigned> *immRegs = NULL; // single-def regs defined by a MoveImm shouldn't coalesce as we may be // able to fold them later { std::unordered_map<unsigned, const MachineInstr *> singleDef; MachineFunction::const_iterator I = MF.begin(), E = MF.end(); // find all registers w/ a single def for(; I != E; I++) { MachineBasicBlock::const_iterator BI = I->begin(), BE = I->end(); for(; BI != BE; BI++) { MachineInstr::const_mop_iterator II, IE; II = BI->operands_begin(); IE = BI->operands_end(); for(; II != IE; II++) if(II->isReg() && II->isDef()) { unsigned R = II->getReg(); std::unordered_map<unsigned, const MachineInstr *>::iterator SI = singleDef.find(R); if(SI == singleDef.end()) singleDef[R] = BI; // first seen! insert else SI->second = NULL; // second seen -- replace w/ NULL } } } std::unordered_map<unsigned, const MachineInstr *>::const_iterator SI = singleDef.begin(), SE = singleDef.end(); for(; SI != SE; SI++) { if(SI->second && SI->second->getDesc().isMoveImmediate()) // single def imm? { if(!immRegs) immRegsHolder.reset(immRegs = new std::unordered_set<unsigned>); immRegs->insert(SI->first); // don't coalesce } } } #if VRRPROF llog("starting tdkRegs finder... (%d)\n", (int)time(NULL)); #endif std::auto_ptr<std::unordered_set<unsigned> > tdkRegsHolder; std::unordered_set<unsigned> *tdkRegs = NULL; bool setjmpSafe = !MF.callsSetJmp() && MF.getFunction()->doesNotThrow(); { tdkRegsHolder.reset(tdkRegs = new std::unordered_set<unsigned>); std::unordered_map<unsigned, unsigned> trivialDefKills; MachineFunction::const_iterator I = MF.begin(), E = MF.end(); // find all registers defed and killed in the same block w/ no intervening // unsafe (due to setjmp) calls + side-effecty operations for(; I != E; I++) { std::unordered_set<unsigned> defs; MachineBasicBlock::const_iterator BI = I->begin(), BE = I->end(); for(; BI != BE; BI++) { // TODO need to add || BI->getDesc().isInlineAsm() here to help stackification? 
if((!setjmpSafe && BI->getDesc().isCall()) || BI->getDesc().hasUnmodeledSideEffects()) { // invalidate on a call instruction if setjmp present, or instr with side effects regardless defs.clear(); } MachineInstr::const_mop_iterator II, IE; // a use of a reg we're not tracking makes it unsafe II = BI->operands_begin(); IE = BI->operands_end(); for(; II != IE; II++) if(II->isReg() && II->isUse()) { unsigned R = II->getReg(); std::unordered_set<unsigned>::const_iterator DI = defs.find(R); if(DI == defs.end()) trivialDefKills[R] = 100; } // kills of tracked defs are trivial def/kills II = BI->operands_begin(); IE = BI->operands_end(); for(; II != IE; II++) if(II->isReg() && II->isKill()) { unsigned R = II->getReg(); std::unordered_set<unsigned>::const_iterator DI = defs.find(R); if(DI != defs.end()) { defs.erase(DI); trivialDefKills[R]++; } else trivialDefKills[R] = 100; // don't use } // record all defs in this instruction II = BI->operands_begin(); IE = BI->operands_end(); for(; II != IE; II++) if(II->isReg() && II->isDef()) defs.insert(II->getReg()); } } std::unordered_map<unsigned, unsigned>::const_iterator DKI = trivialDefKills.begin(), DKE = trivialDefKills.end(); for(; DKI != DKE; DKI++) if(DKI->second == 1) tdkRegs->insert(DKI->first); } #if VRRPROF llog("starting conflict graph construction... (%d)\n", (int)time(NULL)); #endif std::unordered_set<unsigned>::const_iterator tdkE = tdkRegs->end(); std::unordered_set<unsigned> *okRegs = NULL; if(!setjmpSafe) okRegs = tdkRegs; MachineRegisterInfo *RI = &(MF.getRegInfo()); // will eventually hold a virt register coloring for this function ConflictGraph::Coloring coloring; { ConflictGraph cg; LiveIntervals &LIS = getAnalysis<LiveIntervals>(); LiveIntervals::const_iterator I = LIS.begin(), E = LIS.end(); // check every possible LiveInterval, LiveInterval pair of the same // register class for overlap and add overlaps to the conflict graph // also, treat trivially def-kill-ed regs and not trivially def-kill-ed // regs as conflicting so they end up using different VRs -- this makes // stackification easier later in the toolchain for(; I != E; I++) { unsigned R = I->first; if(TargetRegisterInfo::isPhysicalRegister(R)) continue; if(okRegs && okRegs->find(R) == okRegs->end()) continue; // leave singly-defined MoveImm regs for later coalescing if(immRegs && immRegs->find(R) != immRegs->end()) continue; // const TargetRegisterClass *RC = RI->getRegClass(R); const LiveInterval *LI = I->second; if(LI->empty()) continue; cg.addVertex(R); bool notTDK = tdkRegs->find(R) == tdkE; LiveIntervals::const_iterator I1 = I; I1++; for(; I1 != E; I1++) { unsigned R1 = I1->first; if(TargetRegisterInfo::isPhysicalRegister(R1)) continue; if(okRegs && okRegs->find(R1) == okRegs->end()) continue; // leave singly-defined MoveImm regs for later coalescing if(immRegs && immRegs->find(R1) != immRegs->end()) continue; /* Don't bother checking RC -- even though it sounds like an opt, it doesn't speed us up in practice const TargetRegisterClass *RC1 = RI->getRegClass(R1); if(RC != RC1) continue; // different reg class... won't conflict */ const LiveInterval *LI1 = I1->second; // conflict if the intervals overlap OR exactly one of them is TDK if(LI->overlaps(*LI1) || notTDK != (tdkRegs->find(R1) == tdkE)) cg.addEdge(R, R1); } } #if VRRPROF llog("starting coloring... (%d)\n", (int)time(NULL)); #endif cg.color(&coloring); #if VRRPROF llog("starting vreg=>vreg construction... (%d)\n", (int)time(NULL)); #endif typedef std::unordered_map<unsigned, unsigned> VRegMap; VRegMap Regs; // build up map of vreg=>vreg { std::unordered_map<const TargetRegisterClass *, std::unordered_map<unsigned, unsigned> > RCColor2VReg; ConflictGraph::Coloring::const_iterator I = coloring.begin(), E = coloring.end(); for(; I != E; I++) { unsigned R = I->first; unsigned Color = I->second; const TargetRegisterClass *RC = RI->getRegClass(R); std::unordered_map<unsigned, unsigned> &Color2VReg = RCColor2VReg[RC]; VRegMap::const_iterator CI = Color2VReg.find(Color); if(CI != Color2VReg.end()) Regs[R] = CI->second; // seen this color; map it else Regs[R] = Color2VReg[Color] = R; // first sighting of color; bind to this reg } } #if VRRPROF llog("starting remap... (%d)\n", (int)time(NULL)); #endif // remap regs { VRegMap::const_iterator I = Regs.begin(), E = Regs.end(); for(; I != E; I++) if(I->first != I->second) { RI->replaceRegWith(I->first, I->second); Changed = true; } } } #if VRRPROF llog("done... (%d)\n", (int)time(NULL)); #endif return Changed; }
/// scavengeFrameVirtualRegs - Replace all frame index virtual registers /// with physical registers. Use the register scavenger to find an /// appropriate register to use. /// /// FIXME: Iterating over the instruction stream is unnecessary. We can simply /// iterate over the vreg use list, which at this point only contains machine /// operands for which eliminateFrameIndex need a new scratch reg. void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Run through the instructions and find any virtual registers. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { RS->enterBasicBlock(BB); int SPAdj = 0; // The instruction stream may change in the loop, so check BB->end() // directly. for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { MachineInstr *MI = I; MachineBasicBlock::iterator J = llvm::next(I); MachineBasicBlock::iterator P = I == BB->begin() ? MachineBasicBlock::iterator(NULL) : llvm::prior(I); // RS should process this instruction before we might scavenge at this // location. This is because we might be replacing a virtual register // defined by this instruction, and if so, registers killed by this // instruction are available, and defined registers are not. RS->forward(I); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isReg()) { MachineOperand &MO = MI->getOperand(i); unsigned Reg = MO.getReg(); if (Reg == 0) continue; if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; // When we first encounter a new virtual register, it // must be a definition. assert(MI->getOperand(i).isDef() && "frame index virtual missing def!"); // Scavenge a new scratch register const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj); ++NumScavengedRegs; // Replace this reference to the virtual register with the // scratch register. assert (ScratchReg && "Missing scratch register!"); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. RS->setUsed(ScratchReg); } } // If the scavenger needed to use one of its spill slots, the // spill code will have been inserted in between I and J. This is a // problem because we need the spill code before I: Move I to just // prior to J. if (I != llvm::prior(J)) { BB->splice(J, BB, I); // Before we move I, we need to prepare the RS to visit I again. // Specifically, RS will assert if it sees uses of registers that // it believes are undefined. Because we have already processed // register kills in I, when it visits I again, it will believe that // those registers are undefined. To avoid this situation, unprocess // the instruction I. assert(RS->getCurrentPosition() == I && "The register scavenger has an unexpected position"); I = P; RS->unprocess(P); // RS->skipTo(I == BB->begin() ? NULL : llvm::prior(I)); } else ++I; } } }
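// The reordering test above in miniature: after RS->forward(I), any spill
// or reload the scavenger emitted sits between I and J, so I is no longer
// the instruction immediately before J (a restatement of the existing
// check, not new logic):
static bool scavengerInsertedCode(MachineBasicBlock::iterator I,
                                  MachineBasicBlock::iterator J) {
  return I != llvm::prior(J);
}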
/// runOnMachineFunction - This emits the frame section, autos section and /// assembly for each instruction. Also takes care of function begin debug /// directive and file begin debug directive (if required) for the function. /// bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // This calls the base class function required to be called at beginning // of runOnMachineFunction. SetupMachineFunction(MF); // Put the color information from function to its auto section. const Function *F = MF.getFunction(); ColorAutoSection(F); // Emit the function frame (args and temps). EmitFunctionFrame(MF); DbgInfo.BeginFunction(MF); // Now emit the instructions of function in its code section. const MCSection *fCodeSection = getObjFileLowering().SectionForCode(CurrentFnSym->getName(), PAN::isISR(F->getSection())); // Start the Code Section. O << "\n"; OutStreamer.SwitchSection(fCodeSection); // Emit the frame address of the function at the beginning of code. O << "\tretlw low(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n"; O << "\tretlw high(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n"; // Emit function start label. O << *CurrentFnSym << ":\n"; DebugLoc CurDL; O << "\n"; // Print out code for the function. for (MachineFunction::const_iterator I = MF.begin(), E = MF.end(); I != E; ++I) { // Print a label for the basic block. if (I != MF.begin()) { EmitBasicBlockStart(I); } // Print a basic block. for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end(); II != E; ++II) { // Emit the line directive if source line changed. const DebugLoc DL = II->getDebugLoc(); if (!DL.isUnknown() && DL != CurDL) { DbgInfo.ChangeDebugLoc(MF, DL); CurDL = DL; } // Print the assembly for the instruction. EmitInstruction(II); } } // Emit function end debug directives. DbgInfo.EndFunction(MF); return false; // we didn't modify anything. }
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); const SIRegisterInfo &TRI = TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; if (!isSafeToFold(MI.getOpcode())) continue; unsigned OpSize = TII->getOpSize(MI, 1); MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm(); // FIXME: We could also be folding things like FrameIndexes and // TargetIndexes. if (!FoldingImm && !OpToFold.isReg()) continue; // Folding immediates with more than one use will increase program size. // FIXME: This will also reduce register usage, which may be better // in some cases. A better heuristic is needed. if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) && !MRI.hasOneUse(MI.getOperand(0).getReg())) continue; if (OpToFold.isReg() && !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg())) continue; // Prevent folding operands backwards in the function. For example, // the COPY opcode must not be replaced by 1 in this example: // // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 // ... // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> MachineOperand &Dst = MI.getOperand(0); if (Dst.isReg() && !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) continue; // We need mutate the operands of new mov instructions to add implicit // uses of EXEC, but adding them invalidates the use_iterator, so defer // this. SmallVector<MachineInstr *, 4> CopiesToReplace; std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, CopiesToReplace, TII, TRI, MRI); } // Make sure we add EXEC uses to any new v_mov instructions created. for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(MF); for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); // FIXME: Probably shouldn't bother trying to fold if not an // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR // copies. MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); } } } } return false; }
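// FoldCandidate is declared elsewhere in this pass; a plausible minimal
// shape matching how the loop above constructs and consumes it (field and
// constructor names assumed):
struct FoldCandidate {
  MachineInstr *UseMI;       // the instruction folded into
  unsigned UseOpNo;          // which of its operands gets replaced
  MachineOperand *OpToFold;  // set when folding a register
  uint64_t ImmToFold;        // set when folding an immediate

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp)
      : UseMI(MI), UseOpNo(OpNo), OpToFold(0), ImmToFold(0) {
    if (FoldOp->isImm())
      ImmToFold = FoldOp->getImm();
    else
      OpToFold = FoldOp;
  }

  bool isImm() const { return OpToFold == 0; }
};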
bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { const PPCInstrInfo *TII = static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo()); // Give the blocks of the function a dense, in-order, numbering. Fn.RenumberBlocks(); BlockSizes.resize(Fn.getNumBlockIDs()); // Measure each MBB and compute a size for the entire function. unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { MachineBasicBlock *MBB = MFI; unsigned BlockSize = 0; for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); MBBI != EE; ++MBBI) BlockSize += TII->GetInstSizeInBytes(MBBI); BlockSizes[MBB->getNumber()] = BlockSize; FuncSize += BlockSize; } // If the entire function is smaller than the displacement of a branch field, // we know we don't need to shrink any branches in this function. This is a // common case. if (FuncSize < (1 << 15)) { BlockSizes.clear(); return false; } // For each conditional branch, if the offset to its destination is larger // than the offset field allows, transform it into a long branch sequence // like this: // short branch: // bCC MBB // long branch: // b!CC $PC+8 // b MBB // bool MadeChange = true; bool EverMadeChange = false; while (MadeChange) { // Iteratively expand branches until we reach a fixed point. MadeChange = false; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { MachineBasicBlock &MBB = *MFI; unsigned MBBStartOffset = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { MachineBasicBlock *Dest = nullptr; if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm()) Dest = I->getOperand(2).getMBB(); else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) && !I->getOperand(1).isImm()) Dest = I->getOperand(1).getMBB(); else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ || I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) && !I->getOperand(0).isImm()) Dest = I->getOperand(0).getMBB(); if (!Dest) { MBBStartOffset += TII->GetInstSizeInBytes(I); continue; } // Determine the offset from the current branch to the destination // block. int BranchSize; if (Dest->getNumber() <= MBB.getNumber()) { // If this is a backwards branch, the delta is the offset from the // start of this block to this branch, plus the sizes of all blocks // from this block to the dest. BranchSize = MBBStartOffset; for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i) BranchSize += BlockSizes[i]; } else { // Otherwise, add the size of the blocks between this block and the // dest to the number of bytes left in this block. BranchSize = -MBBStartOffset; for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i) BranchSize += BlockSizes[i]; } // If this branch is in range, ignore it. if (isInt<16>(BranchSize)) { MBBStartOffset += 4; continue; } // Otherwise, we have to expand it to a long branch. MachineInstr *OldBranch = I; DebugLoc dl = OldBranch->getDebugLoc(); if (I->getOpcode() == PPC::BCC) { // The BCC operands are: // 0. PPC branch predicate // 1. CR register // 2. Target MBB PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm(); unsigned CRReg = I->getOperand(1).getReg(); // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition. 
BuildMI(MBB, I, dl, TII->get(PPC::BCC)) .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2); } else if (I->getOpcode() == PPC::BC) { unsigned CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BCn) { unsigned CRBit = I->getOperand(0).getReg(); BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2); } else if (I->getOpcode() == PPC::BDNZ) { BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2); } else if (I->getOpcode() == PPC::BDNZ8) { BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2); } else if (I->getOpcode() == PPC::BDZ) { BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2); } else if (I->getOpcode() == PPC::BDZ8) { BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2); } else { llvm_unreachable("Unhandled branch type!"); } // Uncond branch to the real destination. I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest); // Remove the old branch from the function. OldBranch->eraseFromParent(); // Remember that this instruction is 8-bytes, increase the size of the // block by 4, remember to iterate. BlockSizes[MBB.getNumber()] += 4; MBBStartOffset += 8; ++NumExpanded; MadeChange = true; } } EverMadeChange |= MadeChange; } BlockSizes.clear(); return true; }
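// A worked example of the range check driving this expansion: bCC carries a
// signed 16-bit byte displacement, so only targets within [-32768, 32767]
// bytes are reachable directly. If MBBStartOffset is 100 and 40000 bytes of
// blocks sit between the branch and a forward destination, BranchSize is
// 39900, isInt<16>() fails, and the 4-byte bCC is replaced by the 8-byte
// inverted-condition-plus-unconditional sequence above, growing the block
// by 4 bytes and forcing another fixed-point iteration.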
/// runOnMachineFunction - This uses the printInstruction()
/// method to print assembly for each instruction.
///
bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  this->MF = &MF;

  // This calls the base class function required to be called at beginning
  // of runOnMachineFunction.
  SetupMachineFunction(MF);

  // Get the mangled name.
  const Function *F = MF.getFunction();
  CurrentFnName = Mang->getValueName(F);

  // Emit the function variables.
  EmitFunctionFrame(MF);

  // Emit function begin debug directives.
  DbgInfo.EmitFunctBeginDI(F);

  EmitAutos(CurrentFnName);

  // Keep the section name alive in a local string: taking c_str() of the
  // temporary returned by getCodeSectionName would leave a dangling pointer.
  const std::string codeSection = PAN::getCodeSectionName(CurrentFnName);
  const Section *fCodeSection = TAI->getNamedSection(codeSection.c_str(),
                                                     SectionFlags::Code);
  O << "\n";
  // Start the Code Section.
  SwitchToSection(fCodeSection);

  // Emit the frame address of the function at the beginning of code.
  O << "\tretlw low(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
  O << "\tretlw high(" << PAN::getFrameLabel(CurrentFnName) << ")\n";

  // Emit function start label.
  O << CurrentFnName << ":\n";

  // For emitting line directives, we need to keep track of the current
  // source line; emit the line directive only when it changes.
  unsigned CurLine = 0;
  O << "\n";

  // Print out code for the function.
  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    // Print a label for the basic block.
    if (I != MF.begin()) {
      printBasicBlockLabel(I, true);
      O << '\n';
    }

    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
         II != E; ++II) {
      // Emit the line directive if the source line changed.
      const DebugLoc DL = II->getDebugLoc();
      if (!DL.isUnknown()) {
        unsigned line = MF.getDebugLocTuple(DL).Line;
        if (line != CurLine) {
          O << "\t.line " << line << "\n";
          CurLine = line;
        }
      }

      // Print the assembly for the instruction.
      printMachineInstruction(II);
    }
  }

  // Emit function end debug directives.
  DbgInfo.EmitFunctEndDI(F, CurLine);

  return false;  // we didn't modify anything.
}
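// Illustrative sketch (not part of the PIC16 printer): the ".line" emission
// above is a track-and-compare pattern -- emit a directive only when the
// debug line number changes, so consecutive instructions from one source
// line share a single directive. A minimal standalone version, with
// hypothetical names and std::ostream standing in for the printer's stream:

#include <ostream>

class LineDirectiveEmitter {
  unsigned CurLine = 0;   // 0 means no line directive emitted yet.
  std::ostream &OS;

public:
  explicit LineDirectiveEmitter(std::ostream &OS) : OS(OS) {}

  // Called once per instruction with its debug line (0 if unknown).
  void noteInstructionLine(unsigned Line) {
    if (Line != 0 && Line != CurLine) {
      OS << "\t.line " << Line << "\n";
      CurLine = Line;
    }
  }
};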
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Rewrite the pseudo i1 operations as their 32-bit VALU equivalents.
      if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
        continue;
      }

      if (MI.getOpcode() != AMDGPU::COPY ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
        continue;

      const TargetRegisterClass *DstRC =
          MRI.getRegClass(MI.getOperand(0).getReg());
      const TargetRegisterClass *SrcRC =
          MRI.getRegClass(MI.getOperand(1).getReg());

      if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
        // Copy from a 64-bit scalar condition into an i1 vector register:
        // materialize the mask as 0/-1 per lane with v_cndmask.
        I1Defs.push_back(MI.getOperand(0).getReg());
        BuildMI(MBB, &MI, MI.getDebugLoc(),
                TII->get(AMDGPU::V_CNDMASK_B32_e64))
            .addOperand(MI.getOperand(0))
            .addImm(0)
            .addImm(-1)
            .addOperand(MI.getOperand(1))
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0);
        MI.eraseFromParent();
      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                 SrcRC == &AMDGPU::VReg_1RegClass) {
        // Copy from an i1 vector register back to a scalar condition:
        // compare the 0/-1 lanes against zero.
        BuildMI(MBB, &MI, MI.getDebugLoc(),
                TII->get(AMDGPU::V_CMP_NE_I32_e64))
            .addOperand(MI.getOperand(0))
            .addOperand(MI.getOperand(1))
            .addImm(0);
        MI.eraseFromParent();
      }
    }
  }

  for (unsigned Reg : I1Defs)
    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);

  return false;
}
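// Illustrative scalar model (not part of the pass above): after lowering,
// each i1 value occupies a 32-bit VGPR lane holding 0 or -1. V_CNDMASK_B32
// with src0=0 / src1=-1 materializes the mask from a condition bit, and a
// compare-not-equal against zero recovers the condition, so the two copy
// directions handled above round-trip. Function names are hypothetical.

#include <cassert>
#include <cstdint>

static uint32_t cndmaskLane(bool Cond) {
  return Cond ? 0xFFFFFFFFu : 0u;   // select src1=-1 when the lane bit is set
}

static bool cmpNeZeroLane(uint32_t Lane) {
  return Lane != 0;                 // any nonzero lane value reads back as true
}

int main() {
  // Materialize-then-compare is the identity on the boolean.
  assert(cmpNeZeroLane(cndmaskLane(true)));
  assert(!cmpNeZeroLane(cndmaskLane(false)));
  return 0;
}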