Example #1
0
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();

  BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
    .addReg(Reg, getDefRegState(!IsStore))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
    .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
    .addImm(Offset)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  return true;
}
Example #2
0
void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
  MachineFunction &MF = *MI->getParent()->getParent();
  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();
  const TargetMachine &TM = MF.getTarget();

  if (!Subtarget.useMovt(MF)) {
    if (TM.isPositionIndependent())
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12);
    else
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12);
    return;
  }

  if (!TM.isPositionIndependent()) {
    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12);
    return;
  }

  const GlobalValue *GV =
      cast<GlobalValue>((*MI->memoperands_begin())->getValue());

  if (!Subtarget.isGVIndirectSymbol(GV)) {
    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12);
    return;
  }

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Reg = MI->getOperand(0).getReg();
  MachineInstrBuilder MIB;

  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
  auto Flags = MachineMemOperand::MOLoad |
               MachineMemOperand::MODereferenceable |
               MachineMemOperand::MOInvariant;
  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
  MIB.addMemOperand(MMO);
  BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
      .addReg(Reg, RegState::Kill)
      .addImm(0)
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
      .add(predOps(ARMCC::AL));
}
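
The expansion above is driven by a small decision tree over the subtarget and relocation model. As a reading aid, here is a standalone sketch of that classification; StackGuardExpansion and classifyStackGuard are hypothetical names, and plain strings stand in for the ARM:: opcode enumerators used in the real code.

// Standalone sketch of the decision tree in expandLoadStackGuard above.
// Hypothetical struct/function names; not LLVM API.
#include <string>

struct StackGuardExpansion {
  std::string BaseOpc;   // materializes the address of the guard variable
  std::string LoadOpc;   // loads the guard value
  bool ExtraGOTLoad;     // MOV_ga_pcrel_ldr path: the address itself is loaded
                         // through the GOT before the final LDRi12
};

static StackGuardExpansion classifyStackGuard(bool UseMovt, bool PIC,
                                              bool GVIsIndirect) {
  if (!UseMovt)
    return {PIC ? "LDRLIT_ga_pcrel" : "LDRLIT_ga_abs", "LDRi12", false};
  if (!PIC)
    return {"MOVi32imm", "LDRi12", false};
  if (!GVIsIndirect)
    return {"MOV_ga_pcrel", "LDRi12", false};
  return {"MOV_ga_pcrel_ldr", "LDRi12", true};
}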
Example #3
0
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           unsigned Value,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  MachineBasicBlock *MBB = MI->getParent();
  const MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  LLVMContext &Ctx = MF->getFunction()->getContext();
  DebugLoc DL = MI->getDebugLoc();
  bool IsLoad = TII->get(LoadStoreOp).mayLoad();

  bool RanOutOfSGPRs = false;
  unsigned SOffset = ScratchOffset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0);
    if (SOffset == AMDGPU::NoRegister) {
      RanOutOfSGPRs = true;
      SOffset = AMDGPU::SGPR0;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
            .addReg(ScratchOffset)
            .addImm(Offset);
    Offset = 0;
  }

  if (RanOutOfSGPRs)
    Ctx.emitError("Ran out of SGPRs for spilling VGPRs");

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
        Value;
    bool IsKill = (i == e - 1);

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(IsLoad))
      .addReg(ScratchRsrcReg, getKillRegState(IsKill))
      .addReg(SOffset)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }
}
Example #4
0
void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI,
                                        Reloc::Model RM) const {
  MachineFunction &MF = *MI->getParent()->getParent();
  const ARMSubtarget &Subtarget = MF.getSubtarget<ARMSubtarget>();

  if (!Subtarget.useMovt(MF)) {
    if (RM == Reloc::PIC_)
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_pcrel, ARM::LDRi12, RM);
    else
      expandLoadStackGuardBase(MI, ARM::LDRLIT_ga_abs, ARM::LDRi12, RM);
    return;
  }

  if (RM != Reloc::PIC_) {
    expandLoadStackGuardBase(MI, ARM::MOVi32imm, ARM::LDRi12, RM);
    return;
  }

  const GlobalValue *GV =
      cast<GlobalValue>((*MI->memoperands_begin())->getValue());

  if (!Subtarget.GVIsIndirectSymbol(GV, RM)) {
    expandLoadStackGuardBase(MI, ARM::MOV_ga_pcrel, ARM::LDRi12, RM);
    return;
  }

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();
  unsigned Reg = MI->getOperand(0).getReg();
  MachineInstrBuilder MIB;

  MIB = BuildMI(MBB, MI, DL, get(ARM::MOV_ga_pcrel_ldr), Reg)
            .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY);
  unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
      MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4);
  MIB.addMemOperand(MMO);
  MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
  MIB.addReg(Reg, RegState::Kill).addImm(0);
  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  AddDefaultPred(MIB);
}
Example #5
0
MachineInstrBuilder
MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
                                  MachineBasicBlock::iterator I) const {
  MachineInstrBuilder MIB;

  // Certain branches have two forms: e.g. beq $1, $zero, dest vs beqz $1, dest
  // Pick the zero form of the branch for readable assembly and for greater
  // branch distance in non-microMIPS mode.
  // FIXME: Certain atomic sequences on mips64 generate 32bit references to
  // Mips::ZERO, which is incorrect. This test should be updated to use
  // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
  // are fixed.
  bool BranchWithZeroOperand =
      (I->isBranch() && !I->isPseudo() && I->getOperand(1).isReg() &&
       (I->getOperand(1).getReg() == Mips::ZERO ||
        I->getOperand(1).getReg() == Mips::ZERO_64));

  if (BranchWithZeroOperand) {
    switch (NewOpc) {
    case Mips::BEQC:
      NewOpc = Mips::BEQZC;
      break;
    case Mips::BNEC:
      NewOpc = Mips::BNEZC;
      break;
    case Mips::BGEC:
      NewOpc = Mips::BGEZC;
      break;
    case Mips::BLTC:
      NewOpc = Mips::BLTZC;
      break;
    }
  }

  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));

  // For MIPSR6, JI*C requires an immediate 0 as an operand, and JIALC(64)
  // additionally requires the removal of its %RA<imp-def> implicit operand,
  // since copying the implicit operands of the instruction we're looking at
  // will give us the correct flags.
  if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
      NewOpc == Mips::JIALC64) {

    if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
      MIB->RemoveOperand(0);

    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }

    MIB.addImm(0);

  } else if (BranchWithZeroOperand) {
    // For MIPSR6 and microMIPS branches with an explicit zero operand, copy
    // everything after the zero.
    MIB.addOperand(I->getOperand(0));

    for (unsigned J = 2, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }
  } else {
    // All other cases copy all other operands.
    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.addOperand(I->getOperand(J));
    }
  }

  MIB.copyImplicitOps(*I);

  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
  return MIB;
}
Example #6
0
MachineInstrBuilder
MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
                                  MachineBasicBlock::iterator I) const {
  MachineInstrBuilder MIB;

  // Certain branches have two forms: e.g. beq $1, $zero, dest vs beqz $1, dest
  // Pick the zero form of the branch for readable assembly and for greater
  // branch distance in non-microMIPS mode.
  // Additionally, MIPSR6 does not permit the use of register $zero for compact
  // branches.
  // FIXME: Certain atomic sequences on mips64 generate 32bit references to
  // Mips::ZERO, which is incorrect. This test should be updated to use
  // Subtarget.getABI().GetZeroReg() when those atomic sequences and others
  // are fixed.
  int ZeroOperandPosition = -1;
  bool BranchWithZeroOperand = false;
  if (I->isBranch() && !I->isPseudo()) {
    auto TRI = I->getParent()->getParent()->getSubtarget().getRegisterInfo();
    ZeroOperandPosition = I->findRegisterUseOperandIdx(Mips::ZERO, false, TRI);
    BranchWithZeroOperand = ZeroOperandPosition != -1;
  }

  if (BranchWithZeroOperand) {
    switch (NewOpc) {
    case Mips::BEQC:
      NewOpc = Mips::BEQZC;
      break;
    case Mips::BNEC:
      NewOpc = Mips::BNEZC;
      break;
    case Mips::BGEC:
      NewOpc = Mips::BGEZC;
      break;
    case Mips::BLTC:
      NewOpc = Mips::BLTZC;
      break;
    case Mips::BEQC64:
      NewOpc = Mips::BEQZC64;
      break;
    case Mips::BNEC64:
      NewOpc = Mips::BNEZC64;
      break;
    }
  }

  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));

  // For MIPSR6, JI*C requires an immediate 0 as an operand, and JIALC(64)
  // additionally requires the removal of its %RA<imp-def> implicit operand,
  // since copying the implicit operands of the instruction we're looking at
  // will give us the correct flags.
  if (NewOpc == Mips::JIC || NewOpc == Mips::JIALC || NewOpc == Mips::JIC64 ||
      NewOpc == Mips::JIALC64) {

    if (NewOpc == Mips::JIALC || NewOpc == Mips::JIALC64)
      MIB->RemoveOperand(0);

    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      MIB.add(I->getOperand(J));
    }

    MIB.addImm(0);

  } else {
    for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
      if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
        continue;

      MIB.add(I->getOperand(J));
    }
  }

  MIB.copyImplicitOps(*I);

  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
  return MIB;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);


  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, Paired, DL, Write2Desc)
    .addOperand(*Addr) // addr
    .addOperand(*Data0) // data0
    .addOperand(*Data1) // data1
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
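
The offset handling above is easier to follow with concrete numbers. The sketch below (standalone, with a made-up helper name, not part of the pass) restates the rule: write2/read2 offsets are 8-bit counts of EltSize-sized elements, and the st64 forms scale them by a further factor of 64, which is what the UseST64 branch exploits.

// Standalone sketch of the DS_WRITE2/DS_READ2 offset encoding used above.
// encodeDS2Offsets is a hypothetical helper name.
#include <cstdint>

static bool encodeDS2Offsets(uint64_t Offset0, uint64_t Offset1,
                             unsigned EltSize, bool &UseST64,
                             unsigned &Enc0, unsigned &Enc1) {
  uint64_t N0 = Offset0 / EltSize;   // offsets are counted in elements
  uint64_t N1 = Offset1 / EltSize;
  UseST64 = (N0 % 64 == 0) && (N1 % 64 == 0);
  if (UseST64) {                     // st64 forms scale by another 64
    N0 /= 64;
    N1 /= 64;
  }
  if (N0 > 255 || N1 > 255 || N0 == N1)
    return false;                    // must fit in 8 bits and be distinct
  Enc0 = static_cast<unsigned>(N0);
  Enc1 = static_cast<unsigned>(N1);
  return true;
}

// Example: EltSize == 4 with byte offsets 0 and 1024 gives element offsets
// 0 and 256, which do not fit in 8 bits; both are multiples of 64, though,
// so the st64 form encodes them as 0 and 4.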
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
    .addOperand(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
Example #9
0
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           unsigned Value,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  bool IsLoad = TII->get(LoadStoreOp).mayLoad();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffset;
  unsigned OriginalImmOffset = Offset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is called
    // during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it remains true until we implement spilling using
      // scalar stores), we have no way to free up an SGPR. Our solution here
      // is to add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffset;
    } else {
      Scavenged = true;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
            .addReg(ScratchOffset)
            .addImm(Offset);
    Offset = 0;
  }

  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
        Value;

    unsigned SOffsetRegState = 0;
    if (i + 1 == e && Scavenged)
      SOffsetRegState |= RegState::Kill;

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(IsLoad))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }

  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
            .addReg(ScratchOffset)
            .addImm(OriginalImmOffset);
  }
}
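
A short aside on the isUInt<12> check above: the MUBUF immediate offset is an unsigned 12-bit field, and the loop emits one dword access per sub-register, so the whole spilled range has to stay encodable. Here is a standalone sketch of the same arithmetic; fitsInMUBUFImm is a made-up name.

// Standalone sketch of the overflow test above. When it fails, the offset is
// folded into an SGPR with S_ADD_U32 (or, with no SGPR free, into
// ScratchOffset itself and undone afterwards with S_SUB_U32), and the
// immediate restarts at 0.
#include <cstdint>

static bool fitsInMUBUFImm(int64_t Offset, unsigned NumSubRegs) {
  int64_t Size = NumSubRegs * 4;      // one dword access per sub-register
  int64_t End = Offset + Size;
  return End >= 0 && End < (1 << 12); // mirrors isUInt<12>(Offset + Size)
}

// Example: spilling a 512-bit register (16 sub-registers) at offset 4064
// gives End == 4128 >= 4096, so the scratch offset must be adjusted first.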
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize) {
    MachineBasicBlock *MBB = I->getParent();

    // Be sure to use .addOperand(), and not .addReg() with these. We want to be
    // sure we preserve the subregister index and any register flags set on them.
    const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
    const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
    const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
    const MachineOperand *Data1
        = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);


    unsigned Offset0
        = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
    unsigned Offset1
        = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

    unsigned NewOffset0 = Offset0 / EltSize;
    unsigned NewOffset1 = Offset1 / EltSize;
    unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

    // Prefer the st64 form if we can use it, even if we can fit the offset in the
    // non st64 version. I'm not sure if there's any real reason to do this.
    bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
    if (UseST64) {
        NewOffset0 /= 64;
        NewOffset1 /= 64;
        Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
    }

    assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
           (NewOffset0 != NewOffset1) &&
           "Computed offset doesn't fit");

    const MCInstrDesc &Write2Desc = TII->get(Opc);
    DebugLoc DL = I->getDebugLoc();

    MachineInstrBuilder Write2
        = BuildMI(*MBB, I, DL, Write2Desc)
          .addOperand(*Addr) // addr
          .addOperand(*Data0) // data0
          .addOperand(*Data1) // data1
          .addImm(NewOffset0) // offset0
          .addImm(NewOffset1) // offset1
          .addImm(0) // gds
          .addOperand(*M0Reg)  // m0
          .addMemOperand(*I->memoperands_begin())
          .addMemOperand(*Paired->memoperands_begin());

    // XXX - How do we express subregisters here?
    unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
                            M0Reg->getReg()
                          };

    LIS->RemoveMachineInstrFromMaps(I);
    LIS->RemoveMachineInstrFromMaps(Paired);
    I->eraseFromParent();
    Paired->eraseFromParent();

    LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);

    DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
    return Write2.getInstr();
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize) {
    MachineBasicBlock *MBB = I->getParent();

    // Be careful, since the addresses could be subregisters themselves in weird
    // cases, like vectors of pointers.
    const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
    const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);

    unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
    unsigned DestReg1
        = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst)->getReg();

    unsigned Offset0
        = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
    unsigned Offset1
        = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

    unsigned NewOffset0 = Offset0 / EltSize;
    unsigned NewOffset1 = Offset1 / EltSize;
    unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

    // Prefer the st64 form if we can use it, even if we can fit the offset in the
    // non st64 version. I'm not sure if there's any real reason to do this.
    bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
    if (UseST64) {
        NewOffset0 /= 64;
        NewOffset1 /= 64;
        Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
    }

    assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
           (NewOffset0 != NewOffset1) &&
           "Computed offset doesn't fit");

    const MCInstrDesc &Read2Desc = TII->get(Opc);

    const TargetRegisterClass *SuperRC
        = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
    unsigned DestReg = MRI->createVirtualRegister(SuperRC);

    DebugLoc DL = I->getDebugLoc();
    MachineInstrBuilder Read2
        = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
          .addOperand(*AddrReg) // addr
          .addImm(NewOffset0) // offset0
          .addImm(NewOffset1) // offset1
          .addImm(0) // gds
          .addOperand(*M0Reg) // M0
          .addMemOperand(*I->memoperands_begin())
          .addMemOperand(*Paired->memoperands_begin());

    LIS->InsertMachineInstrInMaps(Read2);

    unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
    unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
    updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
    updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);

    LIS->RemoveMachineInstrFromMaps(I);
    LIS->RemoveMachineInstrFromMaps(Paired);
    I->eraseFromParent();
    Paired->eraseFromParent();

    LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
    LIS->shrinkToUses(&AddrRegLI);

    LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
    LIS->shrinkToUses(&M0RegLI);

    // Currently m0 is treated as a register class with one member instead of an
    // implicit physical register. We are using the virtual register for the first
    // one, but we still need to update the live range of the now unused second m0
    // virtual register to avoid verifier errors.
    const MachineOperand *PairedM0Reg
        = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
    LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
    LIS->shrinkToUses(&PairedM0RegLI);

    LIS->getInterval(DestReg); // Create new LI

    DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
    return Read2.getInstr();
}
Example #12
0
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      spillSGPR(MI, Index, RS);
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      restoreSGPR(MI, Index, RS);
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);
      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
      MI->eraseFromParent();
      break;
    }
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                         AMDGPU::OpName::vdata);

      buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            Index,
            VData->getReg(), VData->isKill(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
            *MI->memoperands_begin(),
            RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      if (TII->isMUBUF(*MI)) {
        // Disable offen so we don't need a 0 vgpr base.
        assert(static_cast<int>(FIOperandNum) ==
               AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::vaddr));

        int64_t Offset = FrameInfo.getObjectOffset(Index);
        int64_t OldImm
          = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
        int64_t NewOffset = OldImm + Offset;

        if (isUInt<12>(NewOffset) &&
            buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
          MI->eraseFromParent();
          break;
        }
      }

      int64_t Offset = FrameInfo.getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
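
The default case above has three possible outcomes for a frame-index operand. The following standalone sketch summarizes that decision; FIResolution and classifyFrameIndexUse are hypothetical names, and it assumes the MUBUF OFFSET form actually exists for the opcode, which buildMUBUFOffsetLoadStore still has to confirm in the real code.

// Standalone sketch of the default-case decision in eliminateFrameIndex above.
#include <cstdint>

enum class FIResolution {
  FoldIntoMUBUFImm, // rewrite to the OFFSET form with the combined immediate
  UseImmediate,     // FIOp.ChangeToImmediate(Offset) is already legal
  UseScratchVGPR    // materialize the offset with V_MOV_B32_e32 first
};

static FIResolution classifyFrameIndexUse(bool IsMUBUF, int64_t OldImm,
                                          int64_t FrameOffset,
                                          bool ImmIsLegalForInst) {
  int64_t NewOffset = OldImm + FrameOffset;
  if (IsMUBUF && NewOffset >= 0 && NewOffset < (1 << 12))
    return FIResolution::FoldIntoMUBUFImm;
  return ImmIsLegalForInst ? FIResolution::UseImmediate
                           : FIResolution::UseScratchVGPR;
}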
Example #13
0
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
  unsigned NewOpc;
  bool NewIsUnscaled = false;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    NewIsUnscaled = true;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    NewIsUnscaled = true;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    NewIsUnscaled = true;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    NewIsUnscaled = true;
    break;
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  assert(CSStackSizeInc % 8 == 0);
  int64_t CSStackSizeIncImm = CSStackSizeInc;
  if (!NewIsUnscaled)
    CSStackSizeIncImm /= 8;
  MIB.addImm(CSStackSizeIncImm);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());

  return std::prev(MBB.erase(MBBI));
}
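
The only operand that is rewritten rather than copied is the trailing immediate, and its scaling depends on the new opcode: the pre/post-indexed pair forms encode the writeback offset in 8-byte units, while the single-register pre/post forms take a raw byte offset. A small standalone sketch (made-up helper name) with a worked example:

// Standalone sketch of the immediate rescaling above; rescaleCSStackSizeInc
// is a hypothetical name.
#include <cassert>
#include <cstdint>

static int64_t rescaleCSStackSizeInc(int64_t CSStackSizeInc,
                                     bool NewIsUnscaled) {
  assert(CSStackSizeInc % 8 == 0 && "callee-save area is 8-byte aligned");
  return NewIsUnscaled ? CSStackSizeInc       // STR/LDR pre/post: byte offset
                       : CSStackSizeInc / 8;  // STP/LDP pre/post: 8-byte units
}

// Example: allocating a 16-byte callee-save area (CSStackSizeInc == -16)
// becomes an immediate of -2 on STPXpre but -16 on STRXpre.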
Example #14
0
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
    .addOperand(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  LIS->InsertMachineInstrInMaps(*Read2);

  // repairIntervalsInRange() doesn't handle physical registers, so we have
  // to update the M0 range manually.
  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();

  // The new write to the original destination register is now the copy. Steal
  // the old SlotIndex.
  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);

  I->eraseFromParent();
  Paired->eraseFromParent();

  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
  LIS->shrinkToUses(&AddrRegLI);

  LIS->createAndComputeVirtRegInterval(DestReg);

  if (UpdateM0Range) {
    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
    M0Segment->end = Read2Index.getRegSlot();
  }

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2.getInstr();
}