void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));

    struct SIMachineFunctionInfo::SpilledReg Spill =
      MFI->getSpilledReg(MF, Index, i);
    if (Spill.hasReg()) {
      if (SuperReg == AMDGPU::M0) {
        assert(NumSubRegs == 1);

        unsigned CopyM0
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), CopyM0)
          .addReg(SuperReg, getKillRegState(IsKill));

        // The real spill now kills the temp copy.
        SubReg = SuperReg = CopyM0;
        IsKill = true;
      }

      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // Spill SGPR to a frame index.
      // FIXME we should use S_STORE_DWORD here for VI.
      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Size = FrameInfo.getObjectSize(Index);
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   Size, Align);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)         // src
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);
    }
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
}
void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();

  // m0 is not allowed with readlane/writelane, so a temporary SGPR and an
  // extra copy is needed.
  bool IsM0 = (SuperReg == AMDGPU::M0);
  if (IsM0) {
    assert(NumSubRegs == 1);
    SuperReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  }

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));

    SIMachineFunctionInfo::SpilledReg Spill
      = MFI->getSpilledReg(MF, Index, i);

    if (Spill.hasReg()) {
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              SubReg)
        .addReg(Spill.VGPR)
        .addImm(Spill.Lane)
        .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    } else {
      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      unsigned Size = FrameInfo.getObjectSize(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index);

      MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, Size, Align);

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);

      BuildMI(*MBB, MI, DL,
              TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill)
        .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (IsM0 && SuperReg != AMDGPU::M0) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(SuperReg);
  }

  MI->eraseFromParent();
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.VGPR == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling SGPR");
        }

        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                Spill.VGPR)
                .addReg(SubReg)
                .addImm(Spill.Lane);
      }
      MI->eraseFromParent();
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.VGPR == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling SGPR");
        }

        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
                .addReg(Spill.VGPR)
                .addImm(Spill.Lane)
                .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
      }

      // TODO: only do this when it is needed
      switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
      case AMDGPUSubtarget::SOUTHERN_ISLANDS:
        // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI
        TII->insertNOPs(MI, 3);
        break;
      case AMDGPUSubtarget::SEA_ISLANDS:
        break;
      default: // VOLCANIC_ISLANDS and later
        // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI
        // and later. This also applies to VALUs which write VCC, but we're
        // unlikely to see VMEM use VCC.
        TII->insertNOPs(MI, 4);
      }

      MI->eraseFromParent();
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index), RS);
      MI->eraseFromParent();
      break;
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
        unsigned TmpReg
          = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isWQM(MI) || TII->isDS(MI))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
      }
    }
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    if (NeedFlat)
      MFI->setHasFlatInstructions(true);
  }

  return true;
}
void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();

  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  const unsigned EltSize = 4;

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));

    if (SpillToSMEM) {
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      // Add i * 4 offset
      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_LOAD_DWORD_SGPR),
                SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addMemOperand(MMO);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);

      continue;
    }

    SIMachineFunctionInfo::SpilledReg Spill
      = MFI->getSpilledReg(MF, Index, i);

    if (Spill.hasReg()) {
      auto MIB =
        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
        .addReg(Spill.VGPR)
        .addImm(Spill.Lane);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    } else {
      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      if (TII->isMUBUF(*MI)) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (isUInt<12>(NewOffset) &&
            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          break;
        }
      }

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                 int Index,
                                 RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = MI->getDebugLoc();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned EltSize = 4;
  unsigned ScalarLoadOp;

  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  int64_t FrOffset = FrameInfo.getObjectOffset(Index);

  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      // FIXME: Size may be > 4 but extra bytes wasted.
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      // Add i * 4 offset
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
        .addReg(MFI->getScratchRSrcReg())  // sbase
        .addReg(OffsetReg, RegState::Kill) // soff
        .addImm(0)                         // glc
        .addMemOperand(MMO);

      if (NumSubRegs > 1)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);

      continue;
    }

    SIMachineFunctionInfo::SpilledReg Spill
      = MFI->getSpilledReg(MF, Index, i);

    if (Spill.hasReg()) {
      auto MIB =
        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
        .addReg(Spill.VGPR)
        .addImm(Spill.Lane);

      if (NumSubRegs > 1)
        MIB.addReg(SuperReg, RegState::ImplicitDefine);
    } else {
      // Restore SGPR from a stack slot.
      // FIXME: We should use S_LOAD_DWORD here for VI.
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned Align = FrameInfo.getObjectAlignment(Index);

      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);

      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                   EltSize, MinAlign(Align, EltSize * i));

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);

      auto MIB =
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
        .addReg(TmpReg, RegState::Kill);

      if (NumSubRegs > 1)
        MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
}
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  const unsigned EltSize = 4;

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, getSubRegFromChannel(i));

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);
      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // Add i * 4 wave offset.
      //
      // SMEM instructions only support a single offset, so increment the wave
      // offset.
      int64_t Offset = ST.getWavefrontSize() * (FrOffset + 4 * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BUFFER_STORE_DWORD_SGPR))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
        .addImm(0)                               // glc
        .addMemOperand(MMO);

      continue;
    }

    struct SIMachineFunctionInfo::SpilledReg Spill =
      MFI->getSpilledReg(MF, Index, i);
    if (Spill.hasReg()) {
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)         // src
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
}
void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                               int Index,
                               RegScavenger *RS) const {
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned SuperReg = MI->getOperand(0).getReg();
  bool IsKill = MI->getOperand(0).isKill();
  const DebugLoc &DL = MI->getDebugLoc();

  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();

  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;

  assert(SuperReg != AMDGPU::M0 && "m0 should never spill");

  unsigned OffsetReg = AMDGPU::M0;
  unsigned M0CopyReg = AMDGPU::NoRegister;

  if (SpillToSMEM) {
    if (RS->isRegUsed(AMDGPU::M0)) {
      M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
        .addReg(AMDGPU::M0);
    }
  }

  unsigned ScalarStoreOp;
  unsigned EltSize = 4;
  const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
  if (SpillToSMEM && isSGPRClass(RC)) {
    // XXX - if private_element_size is larger than 4 it might be useful to be
    // able to spill wider vmem spills.
    std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
  }

  ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
  unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

  // SubReg carries the "Kill" flag when SubReg == SuperReg.
  unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
  for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
    unsigned SubReg = NumSubRegs == 1 ?
      SuperReg : getSubReg(SuperReg, SplitParts[i]);

    if (SpillToSMEM) {
      int64_t FrOffset = FrameInfo.getObjectOffset(Index);

      // The allocated memory size is really the wavefront size * the frame
      // index size. The widest register class is 64 bytes, so a 4-byte scratch
      // allocation is enough to spill this in a single stack object.
      //
      // FIXME: Frame size/offsets are computed earlier than this, so the extra
      // space is still unnecessarily allocated.

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));

      // SMEM instructions only support a single offset, so increment the wave
      // offset.
      int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
      if (Offset != 0) {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg())
          .addImm(Offset);
      } else {
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
          .addReg(MFI->getScratchWaveOffsetReg());
      }

      BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
        .addReg(SubReg, getKillRegState(IsKill)) // sdata
        .addReg(MFI->getScratchRSrcReg())        // sbase
        .addReg(OffsetReg, RegState::Kill)       // soff
        .addImm(0)                               // glc
        .addMemOperand(MMO);

      continue;
    }

    struct SIMachineFunctionInfo::SpilledReg Spill =
      MFI->getSpilledReg(MF, Index, i);
    if (Spill.hasReg()) {
      BuildMI(*MBB, MI, DL,
              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
              Spill.VGPR)
        .addReg(SubReg, getKillRegState(IsKill))
        .addImm(Spill.Lane);

      // FIXME: Since this spills to another register instead of an actual
      // frame index, we should delete the frame index when all references to
      // it are fixed.
    } else {
      // Spill SGPR to a frame index.
      // TODO: Should VI try to spill to VGPR and then spill to SMEM?
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      MachineInstrBuilder Mov
        = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
        .addReg(SubReg, SubKillState);

      // There could be undef components of a spilled super register.
      // TODO: Can we detect this and skip the spill?
      if (NumSubRegs > 1) {
        // The last implicit use of the SuperReg carries the "Kill" flag.
        unsigned SuperKillState = 0;
        if (i + 1 == e)
          SuperKillState |= getKillRegState(IsKill);
        Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
      }

      unsigned Align = FrameInfo.getObjectAlignment(Index);
      MachinePointerInfo PtrInfo
        = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
      MachineMemOperand *MMO
        = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                   EltSize, MinAlign(Align, EltSize * i));
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
        .addReg(TmpReg, RegState::Kill)         // src
        .addFrameIndex(Index)                   // vaddr
        .addReg(MFI->getScratchRSrcReg())       // srsrc
        .addReg(MFI->getScratchWaveOffsetReg()) // soffset
        .addImm(i * 4)                          // offset
        .addMemOperand(MMO);
    }
  }

  if (M0CopyReg != AMDGPU::NoRegister) {
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0CopyReg, RegState::Kill);
  }

  MI->eraseFromParent();
  MFI->addToSpilledSGPRs(NumSubRegs);
}
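// Note: the SMEM path above defers the (element size, opcode) choice to
// getSpillEltSize(). The in-tree definition is not reproduced here; the
// sketch below only illustrates the idea, assuming the scalar buffer spill
// ops come in 4-, 8-, and 16-byte variants. getSpillEltSizeSketch is a
// hypothetical name, not LLVM API.
static std::pair<unsigned, unsigned>
getSpillEltSizeSketch(unsigned SuperRegSize, bool Store) {
  // Pick the widest scalar buffer op that evenly covers the spill size.
  if (SuperRegSize % 16 == 0) {
    return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR
                       : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
  }

  if (SuperRegSize % 8 == 0) {
    return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR
                      : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
  }

  return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR
                    : AMDGPU::S_BUFFER_LOAD_DWORD_SGPR };
}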
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedM0 = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isDS(MI.getOpcode())) {
        NeedM0 = true;
        NeedWQM = true;
      }

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode())) {
        NeedM0 = true;
        NeedFlat = true;
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:
          NeedWQM = true;
          break;
      }
    }
  }

  if (NeedM0) {
    MachineBasicBlock &MBB = MF.front();
    // Initialize M0 to a value that won't cause LDS access to be discarded
    // due to offset clamping.
    InitM0ForLDS(MBB.getFirstNonPHI());
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
           "Stack limits should be smaller than 16-bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes".
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned SuperReg = MI->getOperand(0).getReg();
      bool IsKill = MI->getOperand(0).isKill();

      // SubReg carries the "Kill" flag when SubReg == SuperReg.
      unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(SuperReg,
                                           &AMDGPU::SGPR_32RegClass, i);

        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                  Spill.VGPR)
                  .addReg(SubReg, getKillRegState(IsKill))
                  .addImm(Spill.Lane);

          // FIXME: Since this spills to another register instead of an actual
          // frame index, we should delete the frame index when all references
          // to it are fixed.
        } else {
          // Spill SGPR to a frame index.
          // FIXME we should use S_STORE_DWORD here for VI.
          MachineInstrBuilder Mov
            = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
            .addReg(SubReg, SubKillState);

          // There could be undef components of a spilled super register.
          // TODO: Can we detect this and skip the spill?
          if (NumSubRegs > 1) {
            // The last implicit use of the SuperReg carries the "Kill" flag.
            unsigned SuperKillState = 0;
            if (i + 1 == e)
              SuperKillState |= getKillRegState(IsKill);
            Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
          }

          unsigned Size = FrameInfo->getObjectSize(Index);
          unsigned Align = FrameInfo->getObjectAlignment(Index);
          MachinePointerInfo PtrInfo
            = MachinePointerInfo::getFixedStack(*MF, Index);
          MachineMemOperand *MMO
            = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                       Size, Align);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
            .addReg(TmpReg, RegState::Kill)         // src
            .addFrameIndex(Index)                   // frame_idx
            .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
            .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
            .addImm(i * 4)                          // offset
            .addMemOperand(MMO);
        }
      }
      MI->eraseFromParent();
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);

        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                  SubReg)
                  .addReg(Spill.VGPR)
                  .addImm(Spill.Lane)
                  .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        } else {
          // Restore SGPR from a stack slot.
          // FIXME: We should use S_LOAD_DWORD here for VI.

          unsigned Align = FrameInfo->getObjectAlignment(Index);
          unsigned Size = FrameInfo->getObjectSize(Index);

          MachinePointerInfo PtrInfo
            = MachinePointerInfo::getFixedStack(*MF, Index);

          MachineMemOperand *MMO = MF->getMachineMemOperand(
              PtrInfo, MachineMemOperand::MOLoad, Size, Align);

          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
            .addFrameIndex(Index)                   // frame_idx
            .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
            .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
            .addImm(i * 4)                          // offset
            .addMemOperand(MMO);

          BuildMI(*MBB, MI, DL,
                  TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
                  .addReg(TmpReg, RegState::Kill)
                  .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        }
      }

      MI->eraseFromParent();
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::src),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      MI->eraseFromParent();
      break;
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::dst),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      for (const auto &Def : I->defs()) {
        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
          ExecModified = true;
          break;
        }
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::S_ENDPGM: {
          if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid())
            break;

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // S_ENDPGM is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
          }

          I->eraseFromParent();
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    if (NeedFlat)
      MFI->setHasFlatInstructions(true);
  }

  return true;
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.VGPR == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling SGPR");
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
                .addReg(SubReg)
                .addImm(Spill.Lane);
      }
      MI->eraseFromParent();
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());

      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        bool isM0 = SubReg == AMDGPU::M0;

        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.VGPR == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling SGPR");
        }

        if (isM0) {
          SubReg = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
                .addReg(Spill.VGPR)
                .addImm(Spill.Lane);
        if (isM0) {
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
                  .addReg(SubReg);
        }
      }

      TII->insertNOPs(MI, 3);
      MI->eraseFromParent();
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned SrcReg = MI->getOperand(0).getReg();
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      unsigned Size = NumSubRegs * 4;
      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
        unsigned SubReg = NumSubRegs > 1 ?
            getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
            SrcReg;
        Offset += (i * 4);
        MFI->LDSWaveSpillSize = std::max((unsigned)Offset + 4,
                                         (unsigned)MFI->LDSWaveSpillSize);

        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
                                                         Offset, Size);

        if (AddrReg == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
          AddrReg = AMDGPU::VGPR0;
        }

        // Store the value in LDS.
        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32))
                .addImm(0)                       // gds
                .addReg(AddrReg, RegState::Kill) // addr
                .addReg(SubReg)                  // data0
                .addImm(0);                      // offset
      }

      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned DstReg = MI->getOperand(0).getReg();
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      unsigned Size = NumSubRegs * 4;
      unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);

      // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
      for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
        unsigned SubReg = NumSubRegs > 1 ?
            getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
            DstReg;

        Offset += (i * 4);
        unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg,
                                                         Offset, Size);
        if (AddrReg == AMDGPU::NoRegister) {
          LLVMContext &Ctx = MF->getFunction()->getContext();
          Ctx.emitError("Ran out of VGPRs for spilling VGPRs");
          AddrReg = AMDGPU::VGPR0;
        }

        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg)
                .addImm(0)                       // gds
                .addReg(AddrReg, RegState::Kill) // addr
                .addImm(0);                      // offset
      }
      MI->eraseFromParent();
      break;
    }

    default: {
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VReg_32RegClass, MI,
                                               SPAdj);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false);
      }
    }
  }
}