static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
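// Illustrative sketch, not LLVM code: both variants above implement the same
// rule, so it can be reduced to a pure predicate over the accesses' store
// flags plus a "provably disjoint" query. MemAccess, Disjoint, and canReorder
// are hypothetical stand-ins for MachineInstr and the alias/disjointness check.
struct MemAccess {
  bool MayStore;
};

static bool canReorder(const MemAccess &A, const MemAccess &B, bool Disjoint) {
  // RAR (neither access stores) is always safe to reorder; any pairing that
  // involves a store (RAW, WAR, WAW) is safe only when the two accesses are
  // known not to overlap.
  return !(A.MayStore || B.MayStore) || Disjoint;
}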
bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
                            bool &sawLoad, bool &sawStore,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  if (candidate->isImplicitDef() || candidate->isKill())
    return true;

  if (candidate->mayLoad()) {
    sawLoad = true;
    if (sawStore)
      return true;
  }

  if (candidate->mayStore()) {
    if (sawStore)
      return true;
    sawStore = true;
    if (sawLoad)
      return true;
  }

  for (unsigned i = 0, e = candidate->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = candidate->getOperand(i);
    if (!MO.isReg())
      continue; // skip

    unsigned Reg = MO.getReg();

    if (MO.isDef()) {
      // check whether Reg is defined or used before delay slot.
      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
        return true;
    }
    if (MO.isUse()) {
      // check whether Reg is defined before delay slot.
      if (IsRegInSet(RegDefs, Reg))
        return true;
    }
  }

  unsigned Opcode = candidate->getOpcode();

  // LD and LDD may have NOPs inserted afterwards in the case of some LEON
  // processors, so we can't use the delay slot if this feature is switched on.
  if (Subtarget->insertNOPLoad() && Opcode >= SP::LDDArr && Opcode <= SP::LDrr)
    return true;

  // Same as above for FDIV and FSQRT on some LEON processors.
  if (Subtarget->fixAllFDIVSQRT() && Opcode >= SP::FDIVD && Opcode <= SP::FSQRTD)
    return true;

  return false;
}
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();

  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
    .addReg(Reg, getDefRegState(!IsStore))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
    .addImm(Offset)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  return true;
}
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
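// Illustrative sketch (hypothetical enum and helper mirroring the switch
// above, not the SIMemoryLegalizer API): only agent- and system-scope atomic
// loads from the global address space need the L1 bypass bit; at workgroup
// scope and narrower there is no cache to bypass, and scratch never needs it.
enum class SketchScope { SingleThread, Wavefront, Workgroup, Agent, System };

static bool mustBypassL1(SketchScope S, bool IsGlobalAddrSpace) {
  if (!IsGlobalAddrSpace)
    return false; // scratch and other address spaces: nothing to bypass
  return S == SketchScope::Agent || S == SketchScope::System;
}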
X86CallFrameOptimization::InstClassification
X86CallFrameOptimization::classifyInstruction(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
  if (MI == MBB.end())
    return Exit;

  // The instructions we actually care about are movs onto the stack
  int Opcode = MI->getOpcode();
  if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr ||
      Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
    return Convert;

  // Not all calling conventions have only stack MOVs between the stack
  // adjust and the call.
  // We want to tolerate other instructions, to cover more cases.
  // In particular:
  // a) PCrel calls, where we expect an additional COPY of the basereg.
  // b) Passing frame-index addresses.
  // c) Calling conventions that have inreg parameters. These generate
  //    both copies and movs into registers.
  // To avoid creating lots of special cases, allow any instruction
  // that does not write into memory, does not def or use the stack
  // pointer, and does not def any register that was used by a preceding
  // push.
  // (Reading from memory is allowed, even if referenced through a
  // frame index, since these will get adjusted properly in PEI)
  // The reason for the last condition is that the pushes can't replace
  // the movs in place, because the order must be reversed.
  // So if we have a MOV32mr that uses EDX, then an instruction that defs
  // EDX, and then the call, after the transformation the push will use
  // the modified version of EDX, and not the original one.
  // Since we are still in SSA form at this point, we only need to
  // make sure we don't clobber any *physical* registers that were
  // used by an earlier mov that will become a push.

  if (MI->isCall() || MI->mayStore())
    return Exit;

  for (const MachineOperand &MO : MI->operands()) {
    if (!MO.isReg())
      continue;
    unsigned int Reg = MO.getReg();
    if (!RegInfo.isPhysicalRegister(Reg))
      continue;
    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
      return Exit;
    if (MO.isDef()) {
      for (unsigned int U : UsedRegs)
        if (RegInfo.regsOverlap(Reg, U))
          return Exit;
    }
  }

  return Skip;
}
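// Illustrative distillation (hypothetical types, not the X86 pass itself): the
// classification above depends only on a handful of facts about the
// instruction, gathered here into a struct so the decision order is explicit.
enum SketchInstClass { SketchExit, SketchConvert, SketchSkip };

struct SketchInstFacts {
  bool IsStackMov;      // one of the MOV32mi/MOV32mr/MOV64mi32/MOV64mr forms
  bool IsCallOrStore;   // a call, or anything that may write memory
  bool TouchesStackPtr; // defs or uses a register overlapping the stack pointer
  bool DefsEarlierUse;  // defs a physreg used by a mov that will become a push
};

static SketchInstClass classifySketch(const SketchInstFacts &F) {
  if (F.IsStackMov)
    return SketchConvert;
  if (F.IsCallOrStore || F.TouchesStackPtr || F.DefsEarlierUse)
    return SketchExit;
  return SketchSkip; // tolerated: does not interfere with the mov-to-push rewrite
}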
bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}
bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
                            bool &sawLoad, bool &sawStore,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  if (candidate->isImplicitDef() || candidate->isKill())
    return true;

  // Loads or stores cannot be moved past a store to the delay slot
  // and stores cannot be moved past a load.
  if (candidate->mayLoad()) {
    if (sawStore)
      return true;
    sawLoad = true;
  }

  if (candidate->mayStore()) {
    if (sawStore)
      return true;
    sawStore = true;
    if (sawLoad)
      return true;
  }

  assert((!candidate->isCall() && !candidate->isReturn()) &&
         "Cannot put calls or returns in delay slot.");

  for (unsigned i = 0, e = candidate->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = candidate->getOperand(i);
    unsigned Reg;

    if (!MO.isReg() || !(Reg = MO.getReg()))
      continue; // skip

    if (MO.isDef()) {
      // check whether Reg is defined or used before delay slot.
      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
        return true;
    }
    if (MO.isUse()) {
      // check whether Reg is defined before delay slot.
      if (IsRegInSet(RegDefs, Reg))
        return true;
    }
  }
  return false;
}
bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
                            bool &sawLoad, bool &sawStore,
                            SmallSet<unsigned, 32> &RegDefs,
                            SmallSet<unsigned, 32> &RegUses) {
  if (candidate->isImplicitDef() || candidate->isKill())
    return true;

  if (candidate->mayLoad()) {
    sawLoad = true;
    if (sawStore)
      return true;
  }

  if (candidate->mayStore()) {
    if (sawStore)
      return true;
    sawStore = true;
    if (sawLoad)
      return true;
  }

  for (unsigned i = 0, e = candidate->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = candidate->getOperand(i);
    if (!MO.isReg())
      continue; // skip

    unsigned Reg = MO.getReg();

    if (MO.isDef()) {
      // check whether Reg is defined or used before delay slot.
      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
        return true;
    }
    if (MO.isUse()) {
      // check whether Reg is defined before delay slot.
      if (IsRegInSet(RegDefs, Reg))
        return true;
    }
  }
  return false;
}
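// Illustrative sketch of the memory-hazard bookkeeping shared by the three
// delayHasHazard variants above (memHazard is a hypothetical free function;
// the register-def/use checks are left out). A load candidate may not be moved
// past a store already seen between it and the delay slot, and a store
// candidate may not be moved past any earlier memory access.
static bool memHazard(bool MayLoad, bool MayStore,
                      bool &SawLoad, bool &SawStore) {
  if (MayLoad) {
    if (SawStore)
      return true; // load would move past a store
    SawLoad = true;
  }
  if (MayStore) {
    if (SawStore)
      return true; // store would move past a store
    SawStore = true;
    if (SawLoad)
      return true; // store would move past a load
  }
  return false;
}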
// These are the common checks that need to be performed
// to determine if
// 1. compare instruction can be moved before jump.
// 2. feeder to the compare instruction can be moved before jump.
static bool commonChecksToProhibitNewValueJump(bool afterRA,
                                               MachineBasicBlock::iterator MII) {
  // If store in path, bail out.
  if (MII->mayStore())
    return false;

  // If call in path, bail out.
  if (MII->isCall())
    return false;

  // If NVJ is running prior to RA, do the following checks.
  if (!afterRA) {
    // The following Target Opcode instructions are spurious
    // to new value jump. If they are in the path, bail out.
    // KILL sets kill flag on the opcode. It also sets up a
    // single register, out of pair.
    //    %D0<def> = S2_lsr_r_p %D0<kill>, %R2<kill>
    //    %R0<def> = KILL %R0, %D0<imp-use,kill>
    //    %P0<def> = C2_cmpeqi %R0<kill>, 0
    // PHI can be anything after RA.
    // COPY can rematerialize things in between feeder, compare and nvj.
    if (MII->getOpcode() == TargetOpcode::KILL ||
        MII->getOpcode() == TargetOpcode::PHI ||
        MII->getOpcode() == TargetOpcode::COPY)
      return false;

    // The following pseudo Hexagon instructions set "use" and "def"
    // of registers by individual passes in the backend. At this time,
    // we don't know the scope of usage and definitions of these
    // instructions.
    if (MII->getOpcode() == Hexagon::LDriw_pred ||
        MII->getOpcode() == Hexagon::STriw_pred)
      return false;
  }

  return true;
}
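// Illustrative distillation (hypothetical booleans standing in for the
// MachineInstr queries above): the bail-out conditions as a single predicate.
struct PathInstFacts {
  bool MayStore;
  bool IsCall;
  bool IsKillPhiOrCopy;   // TargetOpcode::KILL / PHI / COPY
  bool IsPredSpillPseudo; // Hexagon::LDriw_pred / STriw_pred
};

static bool okToMoveAcross(const PathInstFacts &F, bool AfterRA) {
  if (F.MayStore || F.IsCall)
    return false;
  // The opcode-specific checks only apply when new-value-jump formation runs
  // before register allocation.
  if (!AfterRA && (F.IsKillPhiOrCopy || F.IsPredSpillPseudo))
    return false;
  return true;
}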
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
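// Illustrative sketch (hypothetical enum and helpers, not the legalizer's
// types): the two ordering tests above, isolated. A release-or-stronger
// success ordering, or a sequentially consistent failure ordering, requires a
// wait before the atomic; an acquire-or-stronger success or failure ordering
// requires a wait plus a cache invalidate after it.
enum class Ord { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

static bool needsWaitBefore(Ord Success, Ord Failure) {
  return Success == Ord::Release || Success == Ord::AcquireRelease ||
         Success == Ord::SeqCst || Failure == Ord::SeqCst;
}

static bool needsAcquireActionsAfter(Ord Success, Ord Failure) {
  auto IsAcquireOrStronger = [](Ord O) {
    return O == Ord::Acquire || O == Ord::AcquireRelease || O == Ord::SeqCst;
  };
  return IsAcquireOrStronger(Success) || IsAcquireOrStronger(Failure);
}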
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
                                         unsigned LoadStoreOp,
                                         const MachineOperand *SrcDst,
                                         unsigned ScratchRsrcReg,
                                         unsigned ScratchOffset,
                                         int64_t Offset,
                                         RegScavenger *RS) const {
  unsigned Value = SrcDst->getReg();
  bool IsKill = SrcDst->isKill();
  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  DebugLoc DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffset;
  unsigned OriginalImmOffset = Offset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffset;
    } else {
      Scavenged = true;
    }

    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
      .addReg(ScratchOffset)
      .addImm(Offset);

    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs == 1 ?
      Value : getSubReg(Value, getSubRegFromChannel(i));

    unsigned SOffsetRegState = 0;
    unsigned SrcDstRegState = getDefRegState(!IsStore);
    if (i + 1 == e) {
      SOffsetRegState |= getKillRegState(Scavenged);
      // The last implicit use carries the "Kill" flag.
      SrcDstRegState |= getKillRegState(IsKill);
    }

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(!IsStore))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | SrcDstRegState)
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }

  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
      .addReg(ScratchOffset)
      .addImm(OriginalImmOffset);
  }
}
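// Illustrative sketch (hypothetical callbacks, not the AMDGPU API): the
// fallback taken above when Offset + Size overflows the 12-bit MUBUF immediate
// and no spare SGPR can be scavenged. The offset is folded into the base
// register up front, the per-subregister accesses are emitted with small
// immediates, and the base is restored afterwards.
#include <cstdint>
#include <functional>

static void emitWithFoldedOffset(
    int64_t Offset, unsigned NumSubRegs,
    const std::function<void(int64_t)> &AddToBase,           // base += delta
    const std::function<void(unsigned, int64_t)> &EmitAccess) {
  // Mirrors !isUInt<12>(Offset + Size): negative or >= 4096 does not fit.
  const bool Overflows =
      static_cast<uint64_t>(Offset + int64_t(NumSubRegs) * 4) >= (1u << 12);

  int64_t Imm = Offset;
  if (Overflows) {
    AddToBase(Offset); // S_ADD_U32 in the real code
    Imm = 0;
  }

  for (unsigned I = 0; I != NumSubRegs; ++I, Imm += 4)
    EmitAccess(I, Imm); // one load/store per 32-bit subregister

  if (Overflows)
    AddToBase(-Offset); // S_SUB_U32 restores the original base value
}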