// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();

  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
    .addReg(Reg, getDefRegState(!IsStore))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
    .addImm(Offset)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  return true;
}
void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
  MachineFunction &MF = *MI->getParent()->getParent();
  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
  const TargetMachine &TM = MF.getTarget();

  if (!Subtarget.useMovt(MF)) {
    if (TM.isPositionIndependent())
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
    else
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
    return;
  }

  if (!TM.isPositionIndependent()) {
    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
    return;
  }

  const GlobalValue *GV =
      cast<GlobalValue>((*MI->memoperands_begin())->getValue());

  if (!Subtarget.isGVIndirectSymbol(GV)) {
    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
    return;
  }

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Reg = MI->getOperand(0).getReg();
  MachineInstrBuilder MIB;

  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
  MIB.addMemOperand(MMO);
  BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
      .addReg(Reg, RegState::Kill)
      .addImm(0)
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
      .add(predOps(ARMCC::AL));
}
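// A minimal standalone sketch (not LLVM code) of the opcode-selection logic the
// expandLoadStackGuard variants in this section implement: the opcode that
// materializes the guard's address depends on whether movt/movw is available,
// whether the code is position independent, and whether the global must be
// reached indirectly through the GOT. The enum and the
// selectStackGuardOpcode() helper are illustrative names, not ARM backend API.
#include <cstdio>

enum class AddrOpc { LDRLIT_ga_pcrel, LDRLIT_ga_abs, MOVi32imm,
                     MOV_ga_pcrel, MOV_ga_pcrel_ldr };

// Returns the opcode used to form the guard's address; the value itself is
// then loaded with an LDRi12 (plus an extra GOT load in the indirect case).
static AddrOpc selectStackGuardOpcode(bool UseMovt, bool IsPIC,
                                      bool IsIndirect) {
  if (!UseMovt)
    return IsPIC ? AddrOpc::LDRLIT_ga_pcrel : AddrOpc::LDRLIT_ga_abs;
  if (!IsPIC)
    return AddrOpc::MOVi32imm;
  if (!IsIndirect)
    return AddrOpc::MOV_ga_pcrel;
  return AddrOpc::MOV_ga_pcrel_ldr; // needs the extra GOT-indirect load
}

int main() {
  // e.g. PIC code with movt available and an indirect (non-lazy) symbol.
  AddrOpc Opc = selectStackGuardOpcode(true, true, true);
  std::printf("selected opcode id: %d\n", static_cast<int>(Opc));
}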
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           unsigned Value,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  MachineBasicBlock *MBB = MI->getParent();
  const MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  LLVMContext &Ctx = MF->getFunction()->getContext();
  DebugLoc DL = MI->getDebugLoc();
  bool IsLoad = TII->get(LoadStoreOp).mayLoad();

  bool RanOutOfSGPRs = false;
  unsigned SOffset = ScratchOffset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
    if (SOffset == AMDGPU::NoRegister) {
      RanOutOfSGPRs = true;
      SOffset = AMDGPU::SGPR0;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
        .addReg(ScratchOffset)
        .addImm(Offset);
    Offset = 0;
  }

  if (RanOutOfSGPRs)
    Ctx.emitError("Ran out of SGPRs for spilling VGPRS");

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
        Value;
    bool IsKill = (i == e - 1);

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
        .addReg(SubReg, getDefRegState(IsLoad))
        .addReg(ScratchRsrcReg, getKillRegState(IsKill))
        .addReg(SOffset)
        .addImm(Offset)
        .addImm(0) // glc
        .addImm(0) // slc
        .addImm(0) // tfe
        .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
        .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }
}
void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
                                        Reloc::Model RM) const {
  MachineFunction &MF = *MI->getParent()->getParent();
  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();

  if (!Subtarget.useMovt(MF)) {
    if (RM == Reloc::PIC_)
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM);
    else
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM);
    return;
  }

  if (RM != Reloc::PIC_) {
    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM);
    return;
  }

  const GlobalValue *GV =
      cast<GlobalValue>((*MI->memoperands_begin())->getValue());

  if (!Subtarget.GVIsIndirectSymbol(GV, RM)) {
    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM);
    return;
  }

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Reg = MI->getOperand(0).getReg();
  MachineInstrBuilder MIB;

  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
  unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
      MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
  MIB.addMemOperand(MMO);
  MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
  MIB.addReg(Reg, RegState::Kill).addImm(0);
  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  AddDefaultPred(MIB);
}
MachineInstrBuilder MipsInstrInfo::genInstrWithNewOpc(
    unsigned NewOpc, MachineBasicBlock::iterator I) const {
  MachineInstrBuilder MIB;

  // Certain branches have two forms: e.g. beq $1, $zero, dest vs beqz $1, dest
  // Pick the zero form of the branch for readable assembly and for greater
  // branch distance in non-microMIPS mode.
  // FIXME: Certain atomic sequences on mips64 generate 32bit references to
  // Mips::ZERO, which is incorrect. This test should be updated to use
  // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
  // are fixed.
  bool BranchWithZeroOperand =
      (I->isBranch() && !I->isPseudo() && I->getOperand(1).isReg() &&
       (I->getOperand(1).getReg() == Mips::ZERO ||
        I->getOperand(1).getReg() == Mips::ZERO_64));

  if (BranchWithZeroOperand) {
    switch (NewOpc) {
    case Mips::BEQC:
      NewOpc = Mips::BEQZC;
      break;
    case Mips::BNEC:
      NewOpc = Mips::BNEZC;
      break;
    case Mips::BGEC:
      NewOpc = Mips::BGEZC;
      break;
    case Mips::BLTC:
      NewOpc = Mips::BLTZC;
      break;
    }
  }

  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));

  // For MIPSR6, JI*C requires an immediate 0 as an operand, and JIALC(64)
  // additionally requires the removal of its %RA<imp-def> implicit operand,
  // since copying the implicit operands of the instruction we're looking at
  // gives us the correct flags.
  if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
      NewOpc == Mips::JIALC64) {

    if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
      MIB->RemoveOperand(0);

    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }

    MIB.addImm(0);

  } else if (BranchWithZeroOperand) {
    // For MIPSR6 and microMIPS branches with an explicit zero operand, copy
    // everything after the zero.
    MIB.addOperand(I->getOperand(0));

    for (unsigned J = 2, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }
  } else {
    // All other cases copy all operands.
    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }
  }

  MIB.copyImplicitOps(*I);

  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
  return MIB;
}
MachineInstrBuilder MipsInstrInfo::genInstrWithNewOpc(
    unsigned NewOpc, MachineBasicBlock::iterator I) const {
  MachineInstrBuilder MIB;

  // Certain branches have two forms: e.g. beq $1, $zero, dest vs beqz $1, dest
  // Pick the zero form of the branch for readable assembly and for greater
  // branch distance in non-microMIPS mode.
  // Additionally, MIPSR6 does not permit the use of register $zero for compact
  // branches.
  // FIXME: Certain atomic sequences on mips64 generate 32bit references to
  // Mips::ZERO, which is incorrect. This test should be updated to use
  // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
  // are fixed.
  int ZeroOperandPosition = -1;
  bool BranchWithZeroOperand = false;
  if (I->isBranch() && !I->isPseudo()) {
    auto TRI = I->getParent()->getParent()->getSubtarget().getRegisterInfo();
    ZeroOperandPosition = I->findRegisterUseOperandIdx(Mips::ZERO, false, TRI);
    BranchWithZeroOperand = ZeroOperandPosition != -1;
  }

  if (BranchWithZeroOperand) {
    switch (NewOpc) {
    case Mips::BEQC:
      NewOpc = Mips::BEQZC;
      break;
    case Mips::BNEC:
      NewOpc = Mips::BNEZC;
      break;
    case Mips::BGEC:
      NewOpc = Mips::BGEZC;
      break;
    case Mips::BLTC:
      NewOpc = Mips::BLTZC;
      break;
    case Mips::BEQC64:
      NewOpc = Mips::BEQZC64;
      break;
    case Mips::BNEC64:
      NewOpc = Mips::BNEZC64;
      break;
    }
  }

  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));

  // For MIPSR6, JI*C requires an immediate 0 as an operand, and JIALC(64)
  // additionally requires the removal of its %RA<imp-def> implicit operand,
  // since copying the implicit operands of the instruction we're looking at
  // gives us the correct flags.
  if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
      NewOpc == Mips::JIALC64) {

    if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
      MIB->RemoveOperand(0);

    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.add(I->getOperand(J));
    }

    MIB.addImm(0);

  } else {
    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
        continue;

      MIB.add(I->getOperand(J));
    }
  }

  MIB.copyImplicitOps(*I);

  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
  return MIB;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
    unsigned EltSize, ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, Paired, DL, Write2Desc)
    .addOperand(*Addr) // addr
    .addOperand(*Data0) // data0
    .addOperand(*Data1) // data1
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
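// A minimal standalone sketch (not LLVM code) of the offset arithmetic used by
// the mergeWrite2Pair/mergeRead2Pair functions in this section: byte offsets
// are divided by the element size, the st64 form additionally divides by 64
// when both element offsets are multiples of 64, and the two results must be
// distinct 8-bit values. The encodeDS2Offsets() helper and DS2Offsets struct
// are illustrative names, not part of the pass.
#include <cstdint>
#include <cstdio>

struct DS2Offsets {
  bool Valid;    // true if the pair can be encoded
  bool UseST64;  // whether the *ST64 opcode form would be selected
  unsigned Off0, Off1;
};

static DS2Offsets encodeDS2Offsets(unsigned ByteOff0, unsigned ByteOff1,
                                   unsigned EltSize) {
  unsigned Off0 = ByteOff0 / EltSize;
  unsigned Off1 = ByteOff1 / EltSize;
  bool UseST64 = (Off0 % 64 == 0) && (Off1 % 64 == 0);
  if (UseST64) {
    Off0 /= 64;
    Off1 /= 64;
  }
  bool Valid = Off0 <= 0xff && Off1 <= 0xff && Off0 != Off1;
  return {Valid, UseST64, Off0, Off1};
}

int main() {
  // Two dword accesses at byte offsets 256 and 512: both element offsets are
  // multiples of 64, so the st64 form is chosen with encoded offsets 1 and 2.
  DS2Offsets R = encodeDS2Offsets(256, 512, 4);
  std::printf("valid=%d st64=%d off0=%u off1=%u\n", R.Valid, R.UseST64,
              R.Off0, R.Off1);
}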
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
    unsigned EltSize, ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
    .addOperand(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           unsigned Value,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  bool IsLoad = TII->get(LoadStoreOp).mayLoad();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffset;
  unsigned OriginalImmOffset = Offset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI until we implement spilling using scalar stores), we
      // have no way to free up an SGPR. Our solution here is to add the offset
      // directly to the ScratchOffset register, and then subtract the offset
      // after the spill to return ScratchOffset to its original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffset;
    } else {
      Scavenged = true;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
        .addReg(ScratchOffset)
        .addImm(Offset);
    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
        Value;

    unsigned SOffsetRegState = 0;
    if (i + 1 == e && Scavenged)
      SOffsetRegState |= RegState::Kill;

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(IsLoad))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }

  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
        .addReg(ScratchOffset)
        .addImm(OriginalImmOffset);
  }
}
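// A minimal standalone sketch (not LLVM code) of the fallback used by the
// buildScratchLoadStore variant above: when the per-subregister offsets would
// not fit the 12-bit MUBUF immediate and no SGPR can be scavenged, the offset
// is folded into the base register before the spill sequence and subtracted
// again afterwards, so the caller still sees the original base value. The
// names below (emitSpillSequence, BaseReg) are illustrative only.
#include <cstdint>
#include <cstdio>

static bool fitsUInt12(int64_t V) { return V >= 0 && V < (1 << 12); }

static void emitSpillSequence(uint32_t &BaseReg, int64_t Offset,
                              unsigned NumSubRegs) {
  int64_t Original = Offset;
  bool FoldedIntoBase = false;
  if (!fitsUInt12(Offset + NumSubRegs * 4)) {
    BaseReg += Offset;   // stands in for S_ADD_U32 on the scratch offset SGPR
    Offset = 0;
    FoldedIntoBase = true;
  }
  for (unsigned i = 0; i != NumSubRegs; ++i, Offset += 4)
    std::printf("  store subreg %u at base+%lld\n", i, (long long)Offset);
  if (FoldedIntoBase)
    BaseReg -= Original; // stands in for S_SUB_U32 restoring the base
}

int main() {
  uint32_t BaseReg = 0x1000;            // pretend scratch offset contents
  emitSpillSequence(BaseReg, 5000, 4);  // 5000 + 16 does not fit 12 bits
  std::printf("base restored to 0x%x\n", BaseReg);
}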
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
    unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, I, DL, Write2Desc)
    .addOperand(*Addr) // addr
    .addOperand(*Data0) // data0
    .addOperand(*Data1) // data1
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addOperand(*M0Reg) // m0
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  // XXX - How do we express subregisters here?
  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
                          M0Reg->getReg() };

  LIS->RemoveMachineInstrFromMaps(I);
  LIS->RemoveMachineInstrFromMaps(Paired);
  I->eraseFromParent();
  Paired->eraseFromParent();

  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2.getInstr();
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
    unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);

  unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
  unsigned DestReg1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
    .addOperand(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addOperand(*M0Reg) // M0
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  LIS->InsertMachineInstrInMaps(Read2);

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
  updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
  updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);

  LIS->RemoveMachineInstrFromMaps(I);
  LIS->RemoveMachineInstrFromMaps(Paired);
  I->eraseFromParent();
  Paired->eraseFromParent();

  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
  LIS->shrinkToUses(&AddrRegLI);

  LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
  LIS->shrinkToUses(&M0RegLI);

  // Currently m0 is treated as a register class with one member instead of an
  // implicit physical register. We are using the virtual register for the
  // first one, but we still need to update the live range of the now unused
  // second m0 virtual register to avoid verifier errors.
  const MachineOperand *PairedM0Reg
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
  LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
  LIS->shrinkToUses(&PairedM0RegLI);

  LIS->getInterval(DestReg); // Create new LI

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2.getInstr();
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                         int SPAdj, unsigned FIOperandNum,
                                         RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);

      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      if (TII->isMUBUF(*MI)) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (isUInt<12>(NewOffset) &&
            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          break;
        }
      }

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
          .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
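// A minimal standalone sketch (not LLVM code) of the frame-index folding
// decision in the default case of eliminateFrameIndex above: the frame
// object's offset is added to the instruction's existing immediate, and the
// fold into the MUBUF offset field is only legal if the sum still fits the
// 12-bit unsigned immediate; otherwise the offset must be materialized in a
// register instead. canFoldMUBUFFrameOffset() is an illustrative name.
#include <cstdint>
#include <cstdio>

static bool canFoldMUBUFFrameOffset(int64_t OldImm, int64_t FrameObjectOffset,
                                    int64_t &NewOffset) {
  NewOffset = OldImm + FrameObjectOffset;
  return NewOffset >= 0 && NewOffset < (1 << 12); // isUInt<12> equivalent
}

int main() {
  int64_t NewOffset;
  if (canFoldMUBUFFrameOffset(16, 4000, NewOffset))
    std::printf("fold: new immediate %lld\n", (long long)NewOffset);  // 4016
  if (!canFoldMUBUFFrameOffset(256, 4000, NewOffset))
    std::printf("no fold: %lld needs a register\n",                   // 4256
                (long long)NewOffset);
}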
// Convert a callee-save register save/restore instruction into one that also
// decrements/increments the stack pointer to allocate/deallocate the
// callee-save stack area, by switching the store/load to its pre/post
// increment version.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {

  unsigned NewOpc;
  bool NewIsUnscaled = false;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    NewIsUnscaled = true;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    NewIsUnscaled = true;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    NewIsUnscaled = true;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    NewIsUnscaled = true;
    break;
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  assert(CSStackSizeInc % 8 == 0);
  int64_t CSStackSizeIncImm = CSStackSizeInc;
  if (!NewIsUnscaled)
    CSStackSizeIncImm /= 8;
  MIB.addImm(CSStackSizeIncImm);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());

  return std::prev(MBB.erase(MBBI));
}
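// A minimal standalone sketch (not LLVM code) of the immediate adjustment at
// the end of convertCalleeSaveRestoreToSPPrePostIncDec above: the SP increment
// is always a multiple of 8 bytes; for the scaled pair forms (STP/LDP) the
// encoded immediate is expressed in 8-byte units, while the single-register
// pre/post-indexed forms take the raw byte amount. spImmediateForCalleeSave()
// is an illustrative name.
#include <cassert>
#include <cstdio>

static long spImmediateForCalleeSave(long CSStackSizeInc, bool IsUnscaled) {
  assert(CSStackSizeInc % 8 == 0 && "increment must be 8-byte aligned");
  return IsUnscaled ? CSStackSizeInc : CSStackSizeInc / 8;
}

int main() {
  // Allocating 32 bytes of callee-save area: an STPXpre encodes -4 (in 8-byte
  // units), whereas an STRXpre keeps the raw -32 bytes.
  std::printf("STPXpre imm: %ld\n", spImmediateForCalleeSave(-32, false));
  std::printf("STRXpre imm: %ld\n", spImmediateForCalleeSave(-32, true));
}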
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired,
    unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
    .addOperand(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  LIS->InsertMachineInstrInMaps(*Read2);

  // repairIntervalsInRange() doesn't handle physical registers, so we have to
  // update the M0 range manually.
  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();

  // The new write to the original destination register is now the copy. Steal
  // the old SlotIndex.
  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);

  I->eraseFromParent();
  Paired->eraseFromParent();

  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
  LIS->shrinkToUses(&AddrRegLI);

  LIS->createAndComputeVirtRegInterval(DestReg);

  if (UpdateM0Range) {
    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
    M0Segment->end = Read2Index.getRegSlot();
  }

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2.getInstr();
}