// Try to fuse comparison instruction Compare into a later branch. // Return true on success and if Compare is therefore redundant. bool SystemZElimCompare:: fuseCompareAndBranch(MachineInstr *Compare, SmallVectorImpl<MachineInstr *> &CCUsers) { // See whether we have a comparison that can be fused. unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(), Compare); if (!FusedOpcode) return false; // See whether we have a single branch with which to fuse. if (CCUsers.size() != 1) return false; MachineInstr *Branch = CCUsers[0]; if (Branch->getOpcode() != SystemZ::BRC) return false; // Make sure that the operands are available at the branch. unsigned SrcReg = Compare->getOperand(0).getReg(); unsigned SrcReg2 = (Compare->getOperand(1).isReg() ? Compare->getOperand(1).getReg() : 0); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (MBBI->modifiesRegister(SrcReg, TRI) || (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI))) return false; // Read the branch mask and target. MachineOperand CCMask(MBBI->getOperand(1)); MachineOperand Target(MBBI->getOperand(2)); assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 && "Invalid condition-code mask for integer comparison"); // Clear out all current operands. int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI); assert(CCUse >= 0 && "BRC must use CC"); Branch->RemoveOperand(CCUse); Branch->RemoveOperand(2); Branch->RemoveOperand(1); Branch->RemoveOperand(0); // Rebuild Branch as a fused compare and branch. Branch->setDesc(TII->get(FusedOpcode)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .addOperand(Compare->getOperand(0)) .addOperand(Compare->getOperand(1)) .addOperand(CCMask) .addOperand(Target) .addReg(SystemZ::CC, RegState::ImplicitDefine); // Clear any intervening kills of SrcReg and SrcReg2. 
MBBI = Compare; for (++MBBI; MBBI != MBBE; ++MBBI) { MBBI->clearRegisterKills(SrcReg, TRI); if (SrcReg2) MBBI->clearRegisterKills(SrcReg2, TRI); } FusedComparisons += 1; return true; }
// We have identified this II could be feeder to NVJ, // verify that it can be. static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII, const TargetRegisterInfo *TRI, MachineBasicBlock::iterator II, MachineBasicBlock::iterator end, MachineBasicBlock::iterator skip, MachineFunction &MF) { // Predicated instruction can not be feeder to NVJ. if (QII->isPredicated(*II)) return false; // Bail out if feederReg is a paired register (double regs in // our case). One would think that we can check to see if a given // register cmpReg1 or cmpReg2 is a sub register of feederReg // using -- if (QRI->isSubRegister(feederReg, cmpReg1) logic // before the callsite of this function // But we can not as it comes in the following fashion. // %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill> // %R0<def> = KILL %R0, %D0<imp-use,kill> // %P0<def> = CMPEQri %R0<kill>, 0 // Hence, we need to check if it's a KILL instruction. if (II->getOpcode() == TargetOpcode::KILL) return false; if (II->isImplicitDef()) return false; // Make sure there there is no 'def' or 'use' of any of the uses of // feeder insn between it's definition, this MI and jump, jmpInst // skipping compare, cmpInst. // Here's the example. // r21=memub(r22+r24<<#0) // p0 = cmp.eq(r21, #0) // r4=memub(r3+r21<<#0) // if (p0.new) jump:t .LBB29_45 // Without this check, it will be converted into // r4=memub(r3+r21<<#0) // r21=memub(r22+r24<<#0) // p0 = cmp.eq(r21, #0) // if (p0.new) jump:t .LBB29_45 // and result WAR hazards if converted to New Value Jump. for (unsigned i = 0; i < II->getNumOperands(); ++i) { if (II->getOperand(i).isReg() && (II->getOperand(i).isUse() || II->getOperand(i).isDef())) { MachineBasicBlock::iterator localII = II; ++localII; unsigned Reg = II->getOperand(i).getReg(); for (MachineBasicBlock::iterator localBegin = localII; localBegin != end; ++localBegin) { if (localBegin == skip ) continue; // Check for Subregisters too. 
if (localBegin->modifiesRegister(Reg, TRI) || localBegin->readsRegister(Reg, TRI)) return false; } } } return true; }
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; ST = &MF.getSubtarget<SISubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); IV = getIsaVersion(ST->getFeatureBits()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); HardwareLimits.Named.VM = getVmcntBitMask(IV); HardwareLimits.Named.EXP = getExpcntBitMask(IV); HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; IsFlatOutstanding = false; ReturnsVoid = MFI->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); SmallVector<MachineInstr *, 4> RemoveMI; SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; bool HaveScalarStores = false; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { if (!HaveScalarStores && TII->isScalarStore(*I)) HaveScalarStores = true; if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a // corrupt vccz bit, we need to: // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to // complete. // 2. Restore the correct value of vccz by writing the current value // of vcc back to vcc. if (TII->isSMRD(I->getOpcode())) { VCCZCorrupt = true; } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) { // FIXME: We only care about SMRD instructions here, not LDS or GDS. // Whenever we store a value in vcc, the correct value of vccz is // restored. 
VCCZCorrupt = false; } // Check if we need to apply the bug work-around if (VCCZCorrupt && readsVCCZ(*I)) { DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); // Wait on everything, not just LGKM. vccz reads usually come from // terminators, and we always wait on everything at the end of the // block, so if we only wait on LGKM here, we might end up with // another s_waitcnt inserted right after this if there are non-LGKM // instructions still outstanding. insertWait(MBB, I, LastIssued); // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::VCC) .addReg(AMDGPU::VCC); } } // Record pre-existing, explicitly requested waits if (I->getOpcode() == AMDGPU::S_WAITCNT) { handleExistingWait(*I); RemoveMI.push_back(&*I); continue; } Counters Required; // Wait for everything before a barrier. 
// // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if ((I->getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) || I->getOpcode() == AMDGPU::S_SENDMSG) Required = LastIssued; else Required = handleOperands(*I); Counters Increment = getHwCounts(*I); if (countersNonZero(Required) || countersNonZero(Increment)) increaseCounters(Required, DelayedWaitOn); Changes |= insertWait(MBB, I, Required); pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); if (I->getOpcode() == AMDGPU::S_ENDPGM || I->getOpcode() == AMDGPU::SI_RETURN) EndPgmBlocks.push_back(&MBB); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } if (HaveScalarStores) { // If scalar writes are used, the cache must be flushed or else the next // wave to reuse the same scratch memory can be clobbered. // // Insert s_dcache_wb at wave termination points if there were any scalar // stores, and only if the cache hasn't already been flushed. This could be // improved by looking across blocks for flushes in postdominating blocks // from the stores but an explicitly requested flush is probably very rare. for (MachineBasicBlock *MBB : EndPgmBlocks) { bool SeenDCacheWB = false; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { if (I->getOpcode() == AMDGPU::S_DCACHE_WB) SeenDCacheWB = true; else if (TII->isScalarStore(*I)) SeenDCacheWB = false; // FIXME: It would be better to insert this before a waitcnt if any. if ((I->getOpcode() == AMDGPU::S_ENDPGM || I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) { Changes = true; BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); } } } } for (MachineInstr *I : RemoveMI) I->eraseFromParent(); return Changes; }
static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, const TargetRegisterInfo *TRI, MachineBasicBlock::iterator II, unsigned pReg, bool secondReg, bool optLocation, MachineBasicBlock::iterator end, MachineFunction &MF) { MachineInstr &MI = *II; // If the second operand of the compare is an imm, make sure it's in the // range specified by the arch. if (!secondReg) { int64_t v = MI.getOperand(2).getImm(); if (!(isUInt<5>(v) || ((MI.getOpcode() == Hexagon::C2_cmpeqi || MI.getOpcode() == Hexagon::C2_cmpgti) && (v == -1)))) return false; } unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning. cmpReg1 = MI.getOperand(1).getReg(); if (secondReg) { cmpOp2 = MI.getOperand(2).getReg(); // Make sure that that second register is not from COPY // At machine code level, we don't need this, but if we decide // to move new value jump prior to RA, we would be needing this. MachineRegisterInfo &MRI = MF.getRegInfo(); if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) { MachineInstr *def = MRI.getVRegDef(cmpOp2); if (def->getOpcode() == TargetOpcode::COPY) return false; } } // Walk the instructions after the compare (predicate def) to the jump, // and satisfy the following conditions. ++II ; for (MachineBasicBlock::iterator localII = II; localII != end; ++localII) { // Check 1. // If "common" checks fail, bail out. if (!commonChecksToProhibitNewValueJump(optLocation, localII)) return false; // Check 2. // If there is a def or use of predicate (result of compare), bail out. if (localII->modifiesRegister(pReg, TRI) || localII->readsRegister(pReg, TRI)) return false; // Check 3. // If there is a def of any of the use of the compare (operands of compare), // bail out. // Eg. // p0 = cmp.eq(r2, r0) // r2 = r4 // if (p0.new) jump:t .LBB28_3 if (localII->modifiesRegister(cmpReg1, TRI) || (secondReg && localII->modifiesRegister(cmpOp2, TRI))) return false; } return true; }
// Try to fuse comparison instruction Compare into a later branch. // Return true on success and if Compare is therefore redundant. bool SystemZElimCompare::fuseCompareOperations( MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) { // See whether we have a single branch with which to fuse. if (CCUsers.size() != 1) return false; MachineInstr *Branch = CCUsers[0]; SystemZII::FusedCompareType Type; switch (Branch->getOpcode()) { case SystemZ::BRC: Type = SystemZII::CompareAndBranch; break; case SystemZ::CondReturn: Type = SystemZII::CompareAndReturn; break; case SystemZ::CallBCR: Type = SystemZII::CompareAndSibcall; break; case SystemZ::CondTrap: Type = SystemZII::CompareAndTrap; break; default: return false; } // See whether we have a comparison that can be fused. unsigned FusedOpcode = TII->getFusedCompare(Compare.getOpcode(), Type, &Compare); if (!FusedOpcode) return false; // Make sure that the operands are available at the branch. // SrcReg2 is the register if the source operand is a register, // 0 if the source operand is immediate, and the base register // if the source operand is memory (index is not supported). unsigned SrcReg = Compare.getOperand(0).getReg(); unsigned SrcReg2 = Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0; MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (MBBI->modifiesRegister(SrcReg, TRI) || (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI))) return false; // Read the branch mask, target (if applicable), regmask (if applicable). MachineOperand CCMask(MBBI->getOperand(1)); assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 && "Invalid condition-code mask for integer comparison"); // This is only valid for CompareAndBranch. MachineOperand Target(MBBI->getOperand( Type == SystemZII::CompareAndBranch ? 2 : 0)); const uint32_t *RegMask; if (Type == SystemZII::CompareAndSibcall) RegMask = MBBI->getOperand(2).getRegMask(); // Clear out all current operands. 
int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI); assert(CCUse >= 0 && "BRC/BCR must use CC"); Branch->RemoveOperand(CCUse); // Remove target (branch) or regmask (sibcall). if (Type == SystemZII::CompareAndBranch || Type == SystemZII::CompareAndSibcall) Branch->RemoveOperand(2); Branch->RemoveOperand(1); Branch->RemoveOperand(0); // Rebuild Branch as a fused compare and branch. // SrcNOps is the number of MI operands of the compare instruction // that we need to copy over. unsigned SrcNOps = 2; if (FusedOpcode == SystemZ::CLT || FusedOpcode == SystemZ::CLGT) SrcNOps = 3; Branch->setDesc(TII->get(FusedOpcode)); MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch); for (unsigned I = 0; I < SrcNOps; I++) MIB.addOperand(Compare.getOperand(I)); MIB.addOperand(CCMask); if (Type == SystemZII::CompareAndBranch) { // Only conditional branches define CC, as they may be converted back // to a non-fused branch because of a long displacement. Conditional // returns don't have that problem. MIB.addOperand(Target) .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead); } if (Type == SystemZII::CompareAndSibcall) MIB.addRegMask(RegMask); // Clear any intervening kills of SrcReg and SrcReg2. MBBI = Compare; for (++MBBI; MBBI != MBBE; ++MBBI) { MBBI->clearRegisterKills(SrcReg, TRI); if (SrcReg2) MBBI->clearRegisterKills(SrcReg2, TRI); } FusedComparisons += 1; return true; }
// Per basic block, look for the exec-mask copy sequences produced by control
// flow lowering and collapse them into a single s_<op>_saveexec_b64 (or fold
// the trailing copy into the logical op that feeds it).
//
// Returns false if the function is skipped, true otherwise.
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as the separate operations because spill code may need to be
  // inserted for the saved copy of exec.
  //
  //     x = copy exec
  //     z = s_<op>_b64 x, y
  //     exec = copy z
  // =>
  //     x = s_<op>_saveexec_b64 y
  //

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    unsigned CopyToExec = isCopyToExec(*I);
    if (CopyToExec == AMDGPU::NoRegister)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
        PrepareExecInst->getOperand(0).setIsRenamable(false);

        DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    // Examine everything between the copy from exec and the copy to exec.
    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          DEBUG(dbgs() << "Multiple instructions modify "
                << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        DEBUG(dbgs() << "Found second use of save inst candidate: "
              << *J << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      // The exec copy is the second source, so the operands would have to be
      // swapped.  If that is not possible, give up on this block only;
      // nothing has been modified yet and later blocks must still be
      // processed.
      if (!SaveExecInst->isCommutable())
        continue;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    // Remaining readers of the copied register now read exec directly.
    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
                                    AMDGPU::NoSubRegister, *TRI,
                                    /*ClearIsRenamable=*/true);
    }
  }

  return true;

}
/// For a given conditional copy, predicate the definition of the source of /// the copy under the given condition (using the same predicate register as /// the copy). bool HexagonExpandCondsets::predicate(MachineInstr *TfrI, bool Cond) { // TfrI - A2_tfr[tf] Instruction (not A2_tfrsi). unsigned Opc = TfrI->getOpcode(); (void)Opc; assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf); DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false") << ": " << *TfrI); MachineOperand &MD = TfrI->getOperand(0); MachineOperand &MP = TfrI->getOperand(1); MachineOperand &MS = TfrI->getOperand(2); // The source operand should be a <kill>. This is not strictly necessary, // but it makes things a lot simpler. Otherwise, we would need to rename // some registers, which would complicate the transformation considerably. if (!MS.isKill()) return false; RegisterRef RT(MS); unsigned PredR = MP.getReg(); MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond); if (!DefI || !isPredicable(DefI)) return false; DEBUG(dbgs() << "Source def: " << *DefI); // Collect the information about registers defined and used between the // DefI and the TfrI. // Map: reg -> bitmask of subregs ReferenceMap Uses, Defs; MachineBasicBlock::iterator DefIt = DefI, TfrIt = TfrI; // Check if the predicate register is valid between DefI and TfrI. // If it is, we can then ignore instructions predicated on the negated // conditions when collecting def and use information. bool PredValid = true; for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) { if (!I->modifiesRegister(PredR, 0)) continue; PredValid = false; break; } for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) { MachineInstr *MI = &*I; // If this instruction is predicated on the same register, it could // potentially be ignored. // By default assume that the instruction executes on the same condition // as TfrI (Exec_Then), and also on the opposite one (Exec_Else). 
unsigned Exec = Exec_Then | Exec_Else; if (PredValid && HII->isPredicated(*MI) && MI->readsRegister(PredR)) Exec = (Cond == HII->isPredicatedTrue(*MI)) ? Exec_Then : Exec_Else; for (auto &Op : MI->operands()) { if (!Op.isReg()) continue; // We don't want to deal with physical registers. The reason is that // they can be aliased with other physical registers. Aliased virtual // registers must share the same register number, and can only differ // in the subregisters, which we are keeping track of. Physical // registers ters no longer have subregisters---their super- and // subregisters are other physical registers, and we are not checking // that. RegisterRef RR = Op; if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) return false; ReferenceMap &Map = Op.isDef() ? Defs : Uses; addRefToMap(RR, Map, Exec); } } // The situation: // RT = DefI // ... // RD = TfrI ..., RT // If the register-in-the-middle (RT) is used or redefined between // DefI and TfrI, we may not be able proceed with this transformation. // We can ignore a def that will not execute together with TfrI, and a // use that will. If there is such a use (that does execute together with // TfrI), we will not be able to move DefI down. If there is a use that // executed if TfrI's condition is false, then RT must be available // unconditionally (cannot be predicated). // Essentially, we need to be able to rename RT to RD in this segment. if (isRefInMap(RT, Defs, Exec_Then) || isRefInMap(RT, Uses, Exec_Else)) return false; RegisterRef RD = MD; // If the predicate register is defined between DefI and TfrI, the only // potential thing to do would be to move the DefI down to TfrI, and then // predicate. The reaching def (DefI) must be movable down to the location // of the TfrI. // If the target register of the TfrI (RD) is not used or defined between // DefI and TfrI, consider moving TfrI up to DefI. 
bool CanUp = canMoveOver(TfrI, Defs, Uses); bool CanDown = canMoveOver(DefI, Defs, Uses); // The TfrI does not access memory, but DefI could. Check if it's safe // to move DefI down to TfrI. if (DefI->mayLoad() || DefI->mayStore()) if (!canMoveMemTo(DefI, TfrI, true)) CanDown = false; DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no") << ", can move down: " << (CanDown ? "yes\n" : "no\n")); MachineBasicBlock::iterator PastDefIt = std::next(DefIt); if (CanUp) predicateAt(RD, DefI, PastDefIt, PredR, Cond, MP.isUndef()); else if (CanDown) predicateAt(RD, DefI, TfrIt, PredR, Cond, MP.isUndef()); else return false; if (RT != RD) renameInRange(RT, RD, PredR, Cond, PastDefIt, TfrIt); // Delete the user of RT first (it should work either way, but this order // of deleting is more natural). removeInstrFromLiveness(TfrI); removeInstrFromLiveness(DefI); return true; }
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" // around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; ST = &MF.getSubtarget<SISubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid(); memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); SmallVector<MachineInstr *, 4> RemoveMI; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a // corrupt vccz bit, we need to: // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to // complete. // 2. Restore the correct value of vccz by writing the current value // of vcc back to vcc. if (TII->isSMRD(I->getOpcode())) { VCCZCorrupt = true; } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) { // FIXME: We only care about SMRD instructions here, not LDS or GDS. // Whenever we store a value in vcc, the correct value of vccz is // restored. VCCZCorrupt = false; } // Check if we need to apply the bug work-around if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) { DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); // Wait on everything, not just LGKM. 
vccz reads usually come from // terminators, and we always wait on everything at the end of the // block, so if we only wait on LGKM here, we might end up with // another s_waitcnt inserted right after this if there are non-LGKM // instructions still outstanding. insertWait(MBB, I, LastIssued); // Restore the vccz bit. Any time a value is written to vcc, the vcc // bit is updated, so we can restore the bit by reading the value of // vcc and then writing it back to the register. BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), AMDGPU::VCC) .addReg(AMDGPU::VCC); } } // Record pre-existing, explicitly requested waits if (I->getOpcode() == AMDGPU::S_WAITCNT) { handleExistingWait(*I); RemoveMI.push_back(&*I); continue; } Counters Required; // Wait for everything before a barrier. // // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, // but we also want to wait for any other outstanding transfers before // signalling other hardware blocks if (I->getOpcode() == AMDGPU::S_BARRIER || I->getOpcode() == AMDGPU::S_SENDMSG) Required = LastIssued; else Required = handleOperands(*I); Counters Increment = getHwCounts(*I); if (countersNonZero(Required) || countersNonZero(Increment)) increaseCounters(Required, DelayedWaitOn); Changes |= insertWait(MBB, I, Required); pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } for (MachineInstr *I : RemoveMI) I->eraseFromParent(); return Changes; }