bool LazyLiveness::vregLiveIntoMBB(unsigned vreg, MachineBasicBlock* MBB) {
  MachineDominatorTree& MDT = getAnalysis<MachineDominatorTree>();

  MachineBasicBlock* DefMBB = MRI->def_begin(vreg)->getParent();
  unsigned def = preorder[DefMBB];
  unsigned max_dom = 0;

  // Largest preorder number among the blocks dominated by the def block.
  for (df_iterator<MachineDomTreeNode*> DI = df_begin(MDT[DefMBB]),
       DE = df_end(MDT[DefMBB]); DI != DE; ++DI) {
    if (preorder[DI->getBlock()] > max_dom) {
      max_dom = preorder[(*DI)->getBlock()];
    }
  }

  if (preorder[MBB] <= def || max_dom < preorder[MBB])
    return false;

  SparseBitVector<128>::iterator I = tv[MBB].begin();
  while (I != tv[MBB].end() && *I <= def) ++I;

  while (I != tv[MBB].end() && *I < max_dom) {
    for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(vreg),
         UE = MachineRegisterInfo::use_end(); UI != UE; ++UI) {
      MachineBasicBlock* UseMBB = UI->getParent();
      if (rv[rev_preorder[*I]].test(preorder[UseMBB]))
        return true;

      // Largest preorder number among the blocks dominated by this
      // candidate block.
      unsigned t_dom = 0;
      for (df_iterator<MachineDomTreeNode*> DI = df_begin(MDT[rev_preorder[*I]]),
           DE = df_end(MDT[rev_preorder[*I]]); DI != DE; ++DI)
        if (preorder[DI->getBlock()] > t_dom) {
          t_dom = preorder[(*DI)->getBlock()];
        }

      I = tv[MBB].begin();
      while (I != tv[MBB].end() && *I < t_dom) ++I;
    }
  }

  return false;
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      // FIXME: Fold operands with subregs.
      if (OpToFold.isReg() &&
          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
           OpToFold.getSubReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (!Fold.isImm()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            Fold.OpToFold->setIsKill(false);
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
        }
      }
    }
  }
  return false;
}
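The functions in this listing rely on a FoldCandidate record and an updateOperand helper that are not shown. Below is a minimal sketch reconstructed from the way they are used here (UseMI, UseOpNo, OpToFold, isImm(), isReg(), isFI(), isCommuted()); the field names, constructor, and the physical-register handling are assumptions for illustration, not the verbatim upstream definitions.

// Sketch of the fold-candidate bookkeeping assumed by these functions.
struct FoldCandidate {
  MachineInstr *UseMI;          // Instruction that receives the folded operand.
  union {
    MachineOperand *OpToFold;   // Valid when folding a register operand.
    uint64_t ImmToFold;         // Valid when folding an immediate.
    int FrameIndexToFold;       // Valid when folding a frame index.
  };
  unsigned UseOpNo;             // Operand index in UseMI being replaced.
  MachineOperand::MachineOperandType Kind;
  bool Commuted;                // Whether UseMI was commuted to legalize the fold.

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false)
    : UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
      Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const { return Kind == MachineOperand::MO_FrameIndex; }
  bool isImm() const { return Kind == MachineOperand::MO_Immediate; }
  bool isReg() const { return Kind == MachineOperand::MO_Register; }
  bool isCommuted() const { return Commuted; }
};

// Sketch of updateOperand: rewrite the recorded use operand in place and
// report whether the rewrite succeeded.
static bool updateOperand(FoldCandidate &Fold, const TargetRegisterInfo &TRI) {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm()) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
    return true;
  }

  // Physical-register folds are not handled in this sketch.
  return false;
}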
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
                        unsigned UseOpIdx,
                        std::vector<FoldCandidate> &FoldList,
                        SmallVectorImpl<MachineInstr *> &CopiesToReplace,
                        const SIInstrInfo *TII, const SIRegisterInfo &TRI,
                        MachineRegisterInfo &MRI) {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
      UseOp.isImplicit())) {
    return;
  }

  bool FoldingImm = OpToFold.isImm();
  APInt Imm;

  if (FoldingImm) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI.getRegClass(UseReg) :
      TRI.getPhysRegClass(UseReg);

    Imm = APInt(64, OpToFold.getImm());

    const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode());
    const TargetRegisterClass *FoldRC =
        TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);

    // Split 64-bit constants into 32-bits for folding.
    if (FoldRC->getSize() == 8 && UseOp.getSubReg()) {
      if (UseRC->getSize() != 8)
        return;

      if (UseOp.getSubReg() == AMDGPU::sub0) {
        Imm = Imm.getLoBits(32);
      } else {
        assert(UseOp.getSubReg() == AMDGPU::sub1);
        Imm = Imm.getHiBits(32);
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.
    if (UseMI->getOpcode() == AMDGPU::COPY) {
      unsigned DestReg = UseMI->getOperand(0).getReg();
      const TargetRegisterClass *DestRC
        = TargetRegisterInfo::isVirtualRegister(DestReg) ?
        MRI.getRegClass(DestReg) :
        TRI.getPhysRegClass(DestReg);

      unsigned MovOp = TII->getMovOpcode(DestRC);
      if (MovOp == AMDGPU::COPY)
        return;

      UseMI->setDesc(TII->get(MovOp));
      CopiesToReplace.push_back(UseMI);
    }
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
         RSUse = MRI.use_begin(RegSeqDstReg),
         RSE = MRI.use_end(); RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace, TII, TRI, MRI);
    }
    return;
  }

  const MCInstrDesc &UseDesc = UseMI->getDesc();

  // Don't fold into target independent nodes. Target independent opcodes
  // don't have defined register classes.
  if (UseDesc.isVariadic() ||
      UseDesc.OpInfo[UseOpIdx].RegClass == -1)
    return;

  if (FoldingImm) {
    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

  // FIXME: We could try to change the instruction from 64-bit to 32-bit
  // to enable more folding opportunities. The shrink operands pass
  // already does this.
  return;
}
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
  // uses of EXEC, but adding them invalidates the use_iterator, so defer
  // this.
  SmallVector<MachineInstr *, 4> CopiesToReplace;
  SmallVector<FoldCandidate, 4> FoldList;
  MachineOperand &Dst = MI.getOperand(0);

  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
  if (FoldingImm) {
    unsigned NumLiteralUses = 0;
    MachineOperand *NonInlineUse = nullptr;
    int NonInlineUseOpNo = -1;

    MachineRegisterInfo::use_iterator NextUse, NextInstUse;
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; Use = NextUse) {
      NextUse = std::next(Use);
      MachineInstr *UseMI = Use->getParent();
      unsigned OpNo = Use.getOperandNo();

      // Folding the immediate may reveal operations that can be constant
      // folded or replaced with a copy. This can happen for example after
      // frame indices are lowered to constants or from splitting 64-bit
      // constants.
      //
      // We may also encounter cases where one or both operands are
      // immediates materialized into a register, which would ordinarily not
      // be folded due to multiple uses or operand constraints.
      if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
        DEBUG(dbgs() << "Constant folded " << *UseMI << '\n');

        // Some constant folding cases change the same immediate's use to a
        // new instruction, e.g. and x, 0 -> 0. Make sure we re-visit the
        // user again. The same constant folded instruction could also have
        // a second use operand.
        NextUse = MRI->use_begin(Dst.getReg());
        continue;
      }

      // Try to fold any inline immediate uses, and then only fold other
      // constants if they have one use.
      //
      // The legality of the inline immediate must be checked based on the use
      // operand, not the defining instruction, because 32-bit instructions
      // with 32-bit inline immediate sources may be used to materialize
      // constants used in 16-bit operands.
      //
      // e.g. it is unsafe to fold:
      //  s_mov_b32 s0, 1.0    // materializes 0x3f800000
      //  v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
        foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
      } else {
        if (++NumLiteralUses == 1) {
          NonInlineUse = &*Use;
          NonInlineUseOpNo = OpNo;
        }
      }
    }

    if (NumLiteralUses == 1) {
      MachineInstr *UseMI = NonInlineUse->getParent();
      foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
                  CopiesToReplace);
    }
  } else {
    // Folding a register.
    for (MachineRegisterInfo::use_iterator
           Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
         Use != E; ++Use) {
      MachineInstr *UseMI = Use->getParent();

      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
                  FoldList, CopiesToReplace);
    }
  }

  MachineFunction *MF = MI.getParent()->getParent();
  // Make sure we add EXEC uses to any new v_mov instructions created.
  for (MachineInstr *Copy : CopiesToReplace)
    Copy->addImplicitDefUseOperands(*MF);

  for (FoldCandidate &Fold : FoldList) {
    if (updateOperand(Fold, *TRI)) {
      // Clear kill flags.
      if (Fold.isReg()) {
        assert(Fold.OpToFold && Fold.OpToFold->isReg());
        // FIXME: Probably shouldn't bother trying to fold if not an
        // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
        // copies.
        MRI->clearKillFlags(Fold.OpToFold->getReg());
      }
      DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
            static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
      tryFoldInst(TII, Fold.UseMI);
    } else if (Fold.isCommuted()) {
      // Restore the instruction's original operand order if the fold failed.
      TII->commuteInstruction(*Fold.UseMI, false);
    }
  }
}
void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  unsigned UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(TII, *UseMI, UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp.isReg() && OpToFold.isReg()) {
    if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
      return;

    // Don't fold subregister extracts into tied operands; only fold if it is
    // a full copy, since a subregister use tied to a full register def
    // doesn't really make sense. e.g. don't fold:
    //
    // %vreg1 = COPY %vreg0:sub1
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0>
    //
    // into
    //
    // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0>
    if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
      return;
  }

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    for (MachineRegisterInfo::use_iterator
           RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
         RSUse != RSE; ++RSUse) {

      MachineInstr *RSUseMI = RSUse->getParent();
      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
                  CopiesToReplace);
    }

    return;
  }

  bool FoldingImm = OpToFold.isImm();

  // In order to fold immediates into copies, we need to change the
  // copy to a MOV.
  if (FoldingImm && UseMI->isCopy()) {
    unsigned DestReg = UseMI->getOperand(0).getReg();
    const TargetRegisterClass *DestRC
      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
      MRI->getRegClass(DestReg) :
      TRI->getPhysRegClass(DestReg);

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    UseMI->setDesc(TII->get(MovOp));
    CopiesToReplace.push_back(UseMI);
  } else {
    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes. Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() ||
        UseDesc.OpInfo[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImm) {
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities. The shrink operands pass
    // already does this.
    return;
  }

  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
    TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
    unsigned UseReg = UseOp.getReg();
    const TargetRegisterClass *UseRC
      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
      MRI->getRegClass(UseReg) :
      TRI->getPhysRegClass(UseReg);

    if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (Fold.isReg()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');

          // Folding the immediate may reveal operations that can be constant
          // folded or replaced with a copy. This can happen for example after
          // frame indices are lowered to constants or from splitting 64-bit
          // constants.
          tryConstantFoldOp(MRI, TII, Fold.UseMI);
        }
      }
    }
  }
  return false;
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases. A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      // FIXME: Fold operands with subregs.
      if (OpToFold.isReg() &&
          (!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()) ||
           OpToFold.getSubReg()))
        continue;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();
        const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());

        // FIXME: Fold operands with subregs.
        if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) ||
            UseOp.isImplicit())) {
          continue;
        }

        APInt Imm;

        if (FoldingImm) {
          unsigned UseReg = UseOp.getReg();
          const TargetRegisterClass *UseRC
            = TargetRegisterInfo::isVirtualRegister(UseReg) ?
            MRI.getRegClass(UseReg) :
            TRI.getRegClass(UseReg);

          Imm = APInt(64, OpToFold.getImm());

          // Split 64-bit constants into 32-bits for folding.
          if (UseOp.getSubReg()) {
            if (UseRC->getSize() != 8)
              continue;

            if (UseOp.getSubReg() == AMDGPU::sub0) {
              Imm = Imm.getLoBits(32);
            } else {
              assert(UseOp.getSubReg() == AMDGPU::sub1);
              Imm = Imm.getHiBits(32);
            }
          }

          // In order to fold immediates into copies, we need to change the
          // copy to a MOV.
          if (UseMI->getOpcode() == AMDGPU::COPY) {
            unsigned DestReg = UseMI->getOperand(0).getReg();
            const TargetRegisterClass *DestRC
              = TargetRegisterInfo::isVirtualRegister(DestReg) ?
              MRI.getRegClass(DestReg) :
              TRI.getRegClass(DestReg);

            unsigned MovOp = TII->getMovOpcode(DestRC);
            if (MovOp == AMDGPU::COPY)
              continue;

            UseMI->setDesc(TII->get(MovOp));
          }
        }

        const MCInstrDesc &UseDesc = UseMI->getDesc();

        // Don't fold into target independent nodes. Target independent
        // opcodes don't have defined register classes.
        if (UseDesc.isVariadic() ||
            UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
          continue;

        if (FoldingImm) {
          MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
          tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
          continue;
        }

        tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);

        // FIXME: We could try to change the instruction from 64-bit to 32-bit
        // to enable more folding opportunities. The shrink operands pass
        // already does this.
      }

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (!Fold.isImm()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            Fold.OpToFold->setIsKill(false);
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
        }
      }
    }
  }
  return false;
}