Example #1
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  SmallVector<MachineInstr *, 4> RemoveMI;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on SI/CI where an SMRD instruction may
        // corrupt the vccz bit, so when we detect that an instruction may
        // read from a corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM.  vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
                  .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks.
      if (I->getOpcode() == AMDGPU::S_BARRIER ||
          I->getOpcode() == AMDGPU::S_SENDMSG)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);
    }

    // Wait for everything at the end of the MBB
    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  return Changes;
}
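
The pass above defers deletion: the S_WAITCNT instructions to drop are collected in RemoveMI during the walk and only erased afterwards, so the block iterators stay valid. A minimal standalone sketch of that deferred-erase pattern (the container and predicate here are stand-ins, not part of the pass):

#include <list>
#include <vector>

// Scan a list, remembering elements to delete, and erase them only after
// the scan - mirroring RemoveMI.push_back(&*I) / I->eraseFromParent().
template <typename Pred>
void eraseDeferred(std::list<int> &Insts, Pred ShouldRemove) {
  std::vector<std::list<int>::iterator> RemoveList;
  for (auto It = Insts.begin(), E = Insts.end(); It != E; ++It)
    if (ShouldRemove(*It))
      RemoveList.push_back(It);
  for (auto It : RemoveList)
    Insts.erase(It);
}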
Example #2
void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();
  MachineFrameInfo *MFFrame = MF.getFrameInfo();
  auto *ZII =
      static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineModuleInfo &MMI = MF.getMMI();
  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
  const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
  bool HasFP = hasFP(MF);
  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

  // The current offset of the stack pointer from the CFA.
  int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;

  if (ZFI->getLowSavedGPR()) {
    // Skip over the GPR saves.
    if (MBBI != MBB.end() && MBBI->getOpcode() == SystemZ::STMG)
      ++MBBI;
    else
      llvm_unreachable("Couldn't skip over GPR saves");

    // Add CFI for the GPR saves.
    for (auto &Save : CSI) {
      unsigned Reg = Save.getReg();
      if (SystemZ::GR64BitRegClass.contains(Reg)) {
        int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
        unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
        BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
            .addCFIIndex(CFIIndex);
      }
    }
  }

  uint64_t StackSize = getAllocatedStackSize(MF);
  if (StackSize) {
    // Allocate StackSize bytes.
    int64_t Delta = -int64_t(StackSize);
    emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);

    // Add CFI for the allocation.
    unsigned CFIIndex = MMI.addFrameInst(
        MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
    SPOffsetFromCFA += Delta;
  }

  if (HasFP) {
    // Copy the base of the frame to R11.
    BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R11D)
      .addReg(SystemZ::R15D);

    // Add CFI for the new frame location.
    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
    unsigned CFIIndex = MMI.addFrameInst(
        MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);

    // Mark the FramePtr as live at the beginning of every block except
    // the entry block.  (We'll have marked R11 as live on entry when
    // saving the GPRs.)
    for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
      I->addLiveIn(SystemZ::R11D);
  }

  // Skip over the FPR saves.
  SmallVector<unsigned, 8> CFIIndexes;
  for (auto &Save : CSI) {
    unsigned Reg = Save.getReg();
    if (SystemZ::FP64BitRegClass.contains(Reg)) {
      if (MBBI != MBB.end() &&
          (MBBI->getOpcode() == SystemZ::STD ||
           MBBI->getOpcode() == SystemZ::STDY))
        ++MBBI;
      else
        llvm_unreachable("Couldn't skip over FPR save");

      // Add CFI for this save.
      unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
      int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx());
      unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
          nullptr, DwarfReg, SPOffsetFromCFA + Offset));
      CFIIndexes.push_back(CFIIndex);
    }
  }
  // Complete the CFI for the FPR saves, modelling them as taking effect
  // after the last save.
  for (auto CFIIndex : CFIIndexes) {
    BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
  }
}
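
The CFI bookkeeping above hinges on SPOffsetFromCFA: every directive is expressed relative to the canonical frame address, and the running offset is updated as the prologue moves the stack pointer. A hedged worked example of that arithmetic, assuming an ABI bias of 160 bytes (an illustrative stand-in for SystemZMC::CFAOffsetFromInitialSP):

#include <cassert>
#include <cstdint>

int main() {
  // Assume the CFA sits 160 bytes above the initial SP.
  const int64_t CFAOffsetFromInitialSP = 160;
  int64_t SPOffsetFromCFA = -CFAOffsetFromInitialSP;

  // Allocate a hypothetical 344-byte frame, as emitIncrement would.
  int64_t Delta = -344;

  // SPOffsetFromCFA + Delta is the value handed to createDefCfaOffset:
  // after the allocation the CFA is 504 bytes above the new SP.
  assert(SPOffsetFromCFA + Delta == -504);
  SPOffsetFromCFA += Delta;
  assert(SPOffsetFromCFA == -504);
  return 0;
}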
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());

  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
      MachineInstr &MI = *I;
      I = llvm::next(I);

      // Expand LDS_*_RET instructions
      if (TII->isLDSRetInstr(MI.getOpcode())) {
        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        assert(DstIdx != -1);
        MachineOperand &DstOp = MI.getOperand(DstIdx);
        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
                                               DstOp.getReg(), AMDGPU::OQAP);
        DstOp.setReg(AMDGPU::OQAP);
        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
                                           AMDGPU::OpName::pred_sel);
        // Copy the pred_sel bit
        Mov->getOperand(MovPredSelIdx).setReg(
            MI.getOperand(LDSPredSelIdx).getReg());
      }

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        uint64_t Flags = MI.getOperand(3).getImm();
        // The native opcode used by PRED_X is stored as an immediate in the
        // third operand.
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                            MI.getOperand(2).getImm(), // opcode
                                            MI.getOperand(0).getReg(), // dst
                                            MI.getOperand(1).getReg(), // src0
                                            AMDGPU::ZERO);             // src1
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
        } else {
          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
        }
        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_XY: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_ZW: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan-2).getReg();

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_VEC_LOAD: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }
      case AMDGPU::DOT_4: {

        const R600RegisterInfo &TRI = TII->getRegisterInfo();

        unsigned DstReg = MI.getOperand(0).getReg();
        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned SubDstReg =
              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
          MachineInstr *BMI =
              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Mask) {
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
          unsigned Opcode = BMI->getOpcode();
          // While not strictly necessary from the hw point of view, we force
          // all src operands of a dot4 inst to belong to the same slot.
          unsigned Src0 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
              .getReg();
          unsigned Src1 = BMI->getOperand(
              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
              .getReg();
          (void) Src0;
          (void) Src1;
          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
              (TRI.getEncodingValue(Src1) & 0xff) < 127)
            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
        }
        MI.eraseFromParent();
        continue;
      }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      // T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      // T0_X = DP4 T1_X, T2_X
      // T0_Y (write masked) = DP4 T1_Y, T2_Y
      // T0_Z (write masked) = DP4 T1_Z, T2_Z
      // T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      // T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      // T0_X = MULLO_INT T1_X, T2_X
      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
      // T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      // T0_XYZW = CUBE T1_XYZW
      // becomes:
      // T0_X = CUBE T1_Z, T1_Y
      // T0_Y = CUBE T1_Z, T1_X
      // T0_Z = CUBE T1_X, T1_Z
      // T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
        unsigned Src0 = MI.getOperand(
                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination register.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current Channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Set the IsLast bit
        NotLast = (Chan != 3);

        // Add the new instruction
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
        }
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg);
        SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg);
      }
      MI.eraseFromParent();
    }
  }
  return false;
}
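
Several of the expansions above decide per-channel write masking with Mask = (Chan != TRI.getHWRegChan(DstReg)), relying on the R600 T-registers encoding base * 4 + channel. A tiny self-contained model of that computation (the encoding here is simplified for illustration):

#include <cstdio>

unsigned hwRegChan(unsigned Enc) { return Enc % 4; }
unsigned hwRegBase(unsigned Enc) { return Enc / 4; }

int main() {
  unsigned DstReg = 1 * 4 + 2;  // pretend "T1_Z": base 1, channel 2
  for (unsigned Chan = 0; Chan < 4; ++Chan) {
    bool Mask = (Chan != hwRegChan(DstReg));  // only channel 2 writes
    unsigned SubDstReg = hwRegBase(DstReg) * 4 + Chan;
    std::printf("chan %u -> reg %u%s\n", Chan, SubDstReg,
                Mask ? " (write masked)" : "");
  }
  return 0;
}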
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm()) {
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register.  We can't
          // force them to use VCC here, because the register allocator has
          // trouble with sequences like this, which cause the allocator to run
          // out of registers if vreg0 and vreg1 belong to the VCCReg register
          // class:
          // vreg0 = VOPC;
          // vreg1 = VOPC;
          // S_AND_B64 vreg0, vreg1
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC, and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // dst
      Inst32.addOperand(MI.getOperand(0));

      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');


    }
  }

  return false;
}
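
The S_MOVK_I32 rewrite above rests on isInt<16>(Src.getImm()): the immediate must fit a signed 16-bit field. A minimal standalone version of that range check (LLVM's real helper lives in MathExtras.h; this sketch only mirrors its behavior):

#include <cassert>
#include <cstdint>

// True if X is representable as an N-bit signed integer.
template <unsigned N>
bool isIntN(int64_t X) {
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}

int main() {
  assert(isIntN<16>(32767));   // fits: eligible for S_MOVK_I32
  assert(isIntN<16>(-32768));  // fits
  assert(!isIntN<16>(65535));  // needs the full 32-bit literal
  return 0;
}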
/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
/// register references and actual offsets.
///
void PEI::replaceFrameIndices(MachineFunction &Fn) {
  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?

  const TargetMachine &TM = Fn.getTarget();
  assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
  const TargetFrameLowering *TFI = TM.getFrameLowering();
  bool StackGrowsDown =
    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
  int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
  int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();

  for (MachineFunction::iterator BB = Fn.begin(),
         E = Fn.end(); BB != E; ++BB) {
#ifndef NDEBUG
    int SPAdjCount = 0; // frame setup / destroy count.
#endif
    int SPAdj = 0;  // SP offset due to call frame setup / destroy.
    if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);

    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {

      if (I->getOpcode() == FrameSetupOpcode ||
          I->getOpcode() == FrameDestroyOpcode) {
#ifndef NDEBUG
        // Track whether we see even pairs of them
        SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1;
#endif
        // Remember how much SP has been adjusted to create the call
        // frame.
        int Size = I->getOperand(0).getImm();

        if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
            (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
          Size = -Size;

        SPAdj += Size;

        MachineBasicBlock::iterator PrevI = BB->end();
        if (I != BB->begin()) PrevI = prior(I);
        TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);

        // Visit the instructions created by eliminateCallFramePseudoInstr().
        if (PrevI == BB->end())
          I = BB->begin();     // The replaced instr was the first in the block.
        else
          I = llvm::next(PrevI);
        continue;
      }

      MachineInstr *MI = I;
      bool DoIncr = true;
      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
        if (!MI->getOperand(i).isFI())
            continue;

        // Some instructions (e.g. inline asm instructions) can have
        // multiple frame indices and/or cause eliminateFrameIndex
        // to insert more than one instruction. We need the register
        // scavenger to go through all of these instructions so that
        // it can update its register information. We keep the
        // iterator at the point before insertion so that we can
        // revisit them in full.
        bool AtBeginning = (I == BB->begin());
        if (!AtBeginning) --I;

        // If this instruction has a FrameIndex operand, we need to
        // use that target machine register info object to eliminate
        // it.
        TRI.eliminateFrameIndex(MI, SPAdj, i,
                                FrameIndexVirtualScavenging ?  NULL : RS);

        // Reset the iterator if we were at the beginning of the BB.
        if (AtBeginning) {
          I = BB->begin();
          DoIncr = false;
        }

        MI = 0;
        break;
      }

      if (DoIncr && I != BB->end()) ++I;

      // Update register states.
      if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
    }

    // If we have evenly matched pairs of frame setup / destroy instructions,
    // make sure the adjustments come out to zero. If we don't have matched
    // pairs, we can't be sure the missing bit isn't in another basic block
    // due to a custom inserter playing tricks, so just asserting SPAdj==0
    // isn't sufficient. See tMOVCC on Thumb1, for example.
    assert((SPAdjCount || SPAdj == 0) &&
           "Unbalanced call frame setup / destroy pairs?");
  }
}
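
The SPAdj update above flips the sign of the call-frame size depending on the stack growth direction, so the running value always reflects how far SP has moved for the pending call frame. A worked sketch of just that arithmetic (the enum is a stand-in for the target's setup/destroy opcodes):

#include <cassert>

enum Opcode { FrameSetup, FrameDestroy };

int applyCallFrame(int SPAdj, Opcode Op, int Size, bool StackGrowsDown) {
  // Same sign logic as the loop above.
  if ((!StackGrowsDown && Op == FrameSetup) ||
      (StackGrowsDown && Op == FrameDestroy))
    Size = -Size;
  return SPAdj + Size;
}

int main() {
  int SPAdj = 0;
  SPAdj = applyCallFrame(SPAdj, FrameSetup, 16, /*StackGrowsDown=*/true);
  assert(SPAdj == 16);  // frame indices inside the pair are biased by 16
  SPAdj = applyCallFrame(SPAdj, FrameDestroy, 16, /*StackGrowsDown=*/true);
  assert(SPAdj == 0);   // matched pairs cancel, as the final assert expects
  return 0;
}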
bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  TII = MF.getTarget().getInstrInfo();
  DT = &getAnalysis<MachineDominatorTree>();
  LI = &getAnalysis<LiveIntervals>();

  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      unsigned DestReg = BBI->getOperand(0).getReg();
      addReg(DestReg);
      PHISrcDefs[I].push_back(BBI);

      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
        MachineOperand &SrcMO = BBI->getOperand(i);
        unsigned SrcReg = SrcMO.getReg();
        addReg(SrcReg);
        unionRegs(DestReg, SrcReg);

        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
        if (DefMI)
          PHISrcDefs[DefMI->getParent()].push_back(DefMI);
      }
    }
  }

  // Perform a depth-first traversal of the dominator tree, splitting
  // interferences amongst PHI-congruence classes.
  DenseMap<unsigned, unsigned> CurrentDominatingParent;
  DenseMap<unsigned, unsigned> ImmediateDominatingParent;
  for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
    SplitInterferencesForBasicBlock(*DI->getBlock(),
                                    CurrentDominatingParent,
                                    ImmediateDominatingParent);
  }

  // Insert copies for all PHI source and destination registers.
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      InsertCopiesForPHI(BBI, I);
    }
  }

  // FIXME: Preserve the equivalence classes during copy insertion and use
  // the preserved equivalence classes instead of recomputing them.
  RegNodeMap.clear();
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
         BBI != BBE && BBI->isPHI(); ++BBI) {
      unsigned DestReg = BBI->getOperand(0).getReg();
      addReg(DestReg);

      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
        unsigned SrcReg = BBI->getOperand(i).getReg();
        addReg(SrcReg);
        unionRegs(DestReg, SrcReg);
      }
    }
  }

  DenseMap<unsigned, unsigned> RegRenamingMap;
  bool Changed = false;
  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
    while (BBI != BBE && BBI->isPHI()) {
      MachineInstr *PHI = BBI;

      assert(PHI->getNumOperands() > 0);

      unsigned SrcReg = PHI->getOperand(1).getReg();
      unsigned SrcColor = getRegColor(SrcReg);
      unsigned NewReg = RegRenamingMap[SrcColor];
      if (!NewReg) {
        NewReg = SrcReg;
        RegRenamingMap[SrcColor] = SrcReg;
      }
      MergeLIsAndRename(SrcReg, NewReg);

      unsigned DestReg = PHI->getOperand(0).getReg();
      if (!InsertedDestCopies.count(DestReg))
        MergeLIsAndRename(DestReg, NewReg);

      for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
        unsigned SrcReg = PHI->getOperand(i).getReg();
        MergeLIsAndRename(SrcReg, NewReg);
      }

      ++BBI;
      LI->RemoveMachineInstrFromMaps(PHI);
      PHI->eraseFromParent();
      Changed = true;
    }
  }

  // Due to the insertion of copies to split live ranges, the live intervals are
  // guaranteed to not overlap, except in one case: an original PHI source and a
  // PHI destination copy. In this case, they have the same value and thus don't
  // truly intersect, so we merge them into the value live at that point.
  // FIXME: Is there some better way we can handle this?
  for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
       E = InsertedDestCopies.end(); I != E; ++I) {
    unsigned DestReg = I->first;
    unsigned DestColor = getRegColor(DestReg);
    unsigned NewReg = RegRenamingMap[DestColor];

    LiveInterval &DestLI = LI->getInterval(DestReg);
    LiveInterval &NewLI = LI->getInterval(NewReg);

    assert(DestLI.ranges.size() == 1
           && "PHI destination copy's live interval should be a single live "
               "range from the beginning of the BB to the copy instruction.");
    LiveRange *DestLR = DestLI.begin();
    VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
    if (!NewVNI) {
      NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
      MachineInstr *CopyInstr = I->second;
      CopyInstr->getOperand(1).setIsKill(true);
    }

    LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
    NewLI.addRange(NewLR);

    LI->removeInterval(DestReg);
    MRI->replaceRegWith(DestReg, NewReg);
  }

  // Adjust the live intervals of all PHI source registers to handle the case
  // where the PHIs in successor blocks were the only later uses of the source
  // register.
  for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
       E = InsertedSrcCopySet.end(); I != E; ++I) {
    MachineBasicBlock *MBB = I->first;
    unsigned SrcReg = I->second;
    if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
      SrcReg = RenamedRegister;

    LiveInterval &SrcLI = LI->getInterval(SrcReg);

    bool isLiveOut = false;
    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
         SE = MBB->succ_end(); SI != SE; ++SI) {
      if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
        isLiveOut = true;
        break;
      }
    }

    if (isLiveOut)
      continue;

    MachineOperand *LastUse = findLastUse(MBB, SrcReg);
    assert(LastUse);
    SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
    SrcLI.removeRange(LastUseIndex.getDefIndex(), LI->getMBBEndIdx(MBB));
    LastUse->setIsKill(true);
  }

  LI->renumber();

  Allocator.Reset();
  RegNodeMap.clear();
  PHISrcDefs.clear();
  InsertedSrcCopySet.clear();
  InsertedSrcCopyMap.clear();
  InsertedDestCopies.clear();

  return Changed;
}
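
addReg, unionRegs, and getRegColor are not shown above; from their use they presumably wrap a union-find over virtual register numbers, with getRegColor returning the representative ("color") of a register's congruence class. A minimal sketch under that assumption:

#include <unordered_map>

class RegUnionFind {
  std::unordered_map<unsigned, unsigned> Parent;

public:
  void addReg(unsigned Reg) { Parent.emplace(Reg, Reg); }

  unsigned getColor(unsigned Reg) {
    auto It = Parent.find(Reg);
    if (It == Parent.end() || It->second == Reg)
      return Reg;                 // a register starts as its own class
    unsigned Root = getColor(It->second);
    Parent[Reg] = Root;           // path compression
    return Root;
  }

  void unionRegs(unsigned A, unsigned B) {
    unsigned RootA = getColor(A), RootB = getColor(B);
    if (RootA != RootB)
      Parent[RootA] = RootB;      // merge the two congruence classes
  }
};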
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm()) {
          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC, and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $dst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.addOperand(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }


      Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.addOperand(*Src1);

      const MachineOperand *Src2 =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.addOperand(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc.
          assert(Src2->getReg() == AMDGPU::VCC &&
                 "Unexpected missing register operand");
          Inst32.addOperand(copyRegOperandAsImplicit(*Src2));
        }
      }

      ++NumInstructionsShrunk;
      MI.eraseFromParent();

      foldImmediates(*Inst32, TII, MRI);
      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');


    }
  }
  return false;
}
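
When the 64-bit form can't be shrunk directly, the loop above commutes the instruction and retries, backing out if the commuted form still doesn't qualify. A generic standalone sketch of that try/commute/retry shape (the Inst type and constraint are placeholders, not the real VOP rules):

#include <array>
#include <utility>

struct Inst {
  bool Commutable;
  std::array<int, 2> Src;
};

// Placeholder constraint standing in for the e32 operand restrictions.
bool canShrink(const Inst &I) { return I.Src[1] == 0; }

bool shrinkOrSkip(Inst &I) {
  if (!canShrink(I)) {
    if (!I.Commutable)
      return false;                // like the 'continue' in the pass
    std::swap(I.Src[0], I.Src[1]); // commuteInstruction analogue
    if (!canShrink(I))
      return false;
  }
  // ... emit the 32-bit encoding here ...
  return true;
}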
bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {

  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
               << "********** Function: "
               << MF.getName() << "\n");

#if 0
  // for now disable this, if we move NewValueJump before register
  // allocation we need this information.
  LiveVariables &LVs = getAnalysis<LiveVariables>();
#endif

  QII = static_cast<const HexagonInstrInfo *>(MF.getSubtarget().getInstrInfo());
  QRI = static_cast<const HexagonRegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();

  if (!QRI->Subtarget.hasV4TOps() ||
      DisableNewValueJumps) {
    return false;
  }

  int nvjCount = DbgNVJCount;
  int nvjGenerated = 0;

  // Loop through all the bb's of the function
  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
        MBBb != MBBe; ++MBBb) {
    MachineBasicBlock* MBB = MBBb;

    DEBUG(dbgs() << "** dumping bb ** "
                 << MBB->getNumber() << "\n");
    DEBUG(MBB->dump());
    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
    bool foundJump    = false;
    bool foundCompare = false;
    bool invertPredicate = false;
    unsigned predReg = 0; // predicate reg of the jump.
    unsigned cmpReg1 = 0;
    int cmpOp2 = 0;
    bool MO1IsKill = false;
    bool MO2IsKill = false;
    MachineBasicBlock::iterator jmpPos;
    MachineBasicBlock::iterator cmpPos;
    MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
    MachineBasicBlock *jmpTarget = nullptr;
    bool afterRA = false;
    bool isSecondOpReg = false;
    bool isSecondOpNewified = false;
    // Traverse the basic block - bottom up
    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
             MII != E;) {
      MachineInstr *MI = --MII;
      if (MI->isDebugValue()) {
        continue;
      }

      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
        break;

      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");

      if (!foundJump &&
         (MI->getOpcode() == Hexagon::JMP_t ||
          MI->getOpcode() == Hexagon::JMP_f ||
          MI->getOpcode() == Hexagon::JMP_tnew_t ||
          MI->getOpcode() == Hexagon::JMP_tnew_nt ||
          MI->getOpcode() == Hexagon::JMP_fnew_t ||
          MI->getOpcode() == Hexagon::JMP_fnew_nt)) {
        // This is where you would insert the compare and the
        // instruction that feeds the compare.
        jmpPos = MII;
        jmpInstr = MI;
        predReg = MI->getOperand(0).getReg();
        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);

        // If ifconverter had not messed up with the kill flags of the
        // operands, the following check on the kill flag would suffice.
        // if(!jmpInstr->getOperand(0).isKill()) break;

        // This predicate register is live out of the BB. Checking that
        // would only work if we could actually use live variable analysis
        // on phys regs - but LLVM does not provide LV analysis on phys regs.
        //if(LVs.isLiveOut(predReg, *MBB)) break;

        // Get all the successors of this block - there will always be 2.
        // Check if the predicate register is live-in in those successors.
        // If yes, we can not delete the predicate - I am doing this only
        // because LLVM does not provide LiveOut at the BB level.
        bool predLive = false;
        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
                            SIE = MBB->succ_end(); SI != SIE; ++SI) {
          MachineBasicBlock* succMBB = *SI;
          if (succMBB->isLiveIn(predReg)) {
            predLive = true;
          }
        }
        if (predLive)
          break;

        jmpTarget = MI->getOperand(1).getMBB();
        foundJump = true;
        if (MI->getOpcode() == Hexagon::JMP_f ||
            MI->getOpcode() == Hexagon::JMP_fnew_t ||
            MI->getOpcode() == Hexagon::JMP_fnew_nt) {
          invertPredicate = true;
        }
        continue;
      }

      // No new value jump if there is a barrier. A barrier has to be in its
      // own packet. A barrier has zero operands. We conservatively bail out
      // here if we see any instruction with zero operands.
      if (foundJump && MI->getNumOperands() == 0)
        break;

      if (foundJump &&
         !foundCompare &&
          MI->getOperand(0).isReg() &&
          MI->getOperand(0).getReg() == predReg) {

        // Not all compares can be new value compares. Arch Spec: 7.6.1.1
        if (QII->isNewValueJumpCandidate(MI)) {

          assert((MI->getDesc().isCompare()) &&
              "Only compare instruction can be collapsed into New Value Jump");
          isSecondOpReg = MI->getOperand(2).isReg();

          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
                                        afterRA, jmpPos, MF))
            break;

          cmpInstr = MI;
          cmpPos = MII;
          foundCompare = true;

          // We need cmpReg1 and cmpOp2 (imm or reg) while building the
          // new value jump instruction.
          cmpReg1 = MI->getOperand(1).getReg();
          if (MI->getOperand(1).isKill())
            MO1IsKill = true;

          if (isSecondOpReg) {
            cmpOp2 = MI->getOperand(2).getReg();
            if (MI->getOperand(2).isKill())
              MO2IsKill = true;
          } else
            cmpOp2 = MI->getOperand(2).getImm();
          continue;
        }
      }

      if (foundCompare && foundJump) {

        // If "common" checks fail, bail out on this BB.
        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
          break;

        bool foundFeeder = false;
        MachineBasicBlock::iterator feederPos = MII;
        if (MI->getOperand(0).isReg() &&
            MI->getOperand(0).isDef() &&
           (MI->getOperand(0).getReg() == cmpReg1 ||
            (isSecondOpReg &&
             MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {

          unsigned feederReg = MI->getOperand(0).getReg();

          // First try to see if we can get the feeder from the first operand
          // of the compare. If we can not, and if secondOpReg is true
          // (second operand of the compare is also register), try that one.
          // TODO: Try to come up with some heuristic to figure out which
          // feeder would benefit.

          if (feederReg == cmpReg1) {
            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
              if (!isSecondOpReg)
                break;
              else
                continue;
            } else
              foundFeeder = true;
          }

          if (!foundFeeder &&
               isSecondOpReg &&
               feederReg == (unsigned) cmpOp2)
            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
              break;

          if (isSecondOpReg) {
            // In case of CMPLT, or CMPLTU, or EQ with the second register
            // to newify, swap the operands.
            if (cmpInstr->getOpcode() == Hexagon::C2_cmpeq &&
                                     feederReg == (unsigned) cmpOp2) {
              unsigned tmp = cmpReg1;
              bool tmpIsKill = MO1IsKill;
              cmpReg1 = cmpOp2;
              MO1IsKill = MO2IsKill;
              cmpOp2 = tmp;
              MO2IsKill = tmpIsKill;
            }

            // Now we have swapped the operands, all we need to check is,
            // if the second operand (after swap) is the feeder.
            // And if it is, make a note.
            if (feederReg == (unsigned)cmpOp2)
              isSecondOpNewified = true;
          }

          // Now that we are moving the feeder close to the jump, make
          // sure we are respecting the kill values of the feeder's
          // operands.

          bool updatedIsKill = false;
          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
            MachineOperand &MO = MI->getOperand(i);
            if (MO.isReg() && MO.isUse()) {
              unsigned feederReg = MO.getReg();
              for (MachineBasicBlock::iterator localII = feederPos,
                   end = jmpPos; localII != end; localII++) {
                MachineInstr *localMI = localII;
                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
                  MachineOperand &localMO = localMI->getOperand(j);
                  if (localMO.isReg() && localMO.isUse() &&
                      localMO.isKill() && feederReg == localMO.getReg()) {
                    // We found a later kill of this use register: clear
                    // it there and set the kill flag on the feeder's
                    // operand instead.
                    localMO.setIsKill(false);
                    MO.setIsKill();
                    updatedIsKill = true;
                    break;
                  }
                }
                if (updatedIsKill) break;
              }
            }
            if (updatedIsKill) break;
          }

          MBB->splice(jmpPos, MI->getParent(), MI);
          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
          DebugLoc dl = MI->getDebugLoc();
          MachineInstr *NewMI;

          assert(QII->isNewValueJumpCandidate(cmpInstr) &&
                 "This compare is not a New Value Jump candidate.");
          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
                                               isSecondOpNewified,
                                               jmpTarget, MBPI);
          if (invertPredicate)
            opc = QII->getInvertedPredicatedOpcode(opc);

          if (isSecondOpReg)
            NewMI = BuildMI(*MBB, jmpPos, dl,
                                  QII->get(opc))
                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
                                    .addReg(cmpOp2, getKillRegState(MO2IsKill))
                                    .addMBB(jmpTarget);

          else if ((cmpInstr->getOpcode() == Hexagon::C2_cmpeqi ||
                    cmpInstr->getOpcode() == Hexagon::C2_cmpgti) &&
                    cmpOp2 == -1 )
            // Corresponding new-value compare jump instructions don't have the
            // operand for -1 immediate value.
            NewMI = BuildMI(*MBB, jmpPos, dl,
                                  QII->get(opc))
                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
                                    .addMBB(jmpTarget);

          else
            NewMI = BuildMI(*MBB, jmpPos, dl,
                                  QII->get(opc))
                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
                                    .addImm(cmpOp2)
                                    .addMBB(jmpTarget);

          assert(NewMI && "New Value Jump Instruction Not created!");
          (void)NewMI;
          if (cmpInstr->getOperand(0).isReg() &&
              cmpInstr->getOperand(0).isKill())
            cmpInstr->getOperand(0).setIsKill(false);
          if (cmpInstr->getOperand(1).isReg() &&
              cmpInstr->getOperand(1).isKill())
            cmpInstr->getOperand(1).setIsKill(false);
          cmpInstr->eraseFromParent();
          jmpInstr->eraseFromParent();
          ++nvjGenerated;
          ++NumNVJGenerated;
          break;
        }
      }
    }
  }

  return true;
}
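
The block is scanned bottom-up with a pre-decremented iterator, a shape that is easy to get wrong around 'continue'. A standalone sketch of the same traversal over a plain list (contents are illustrative):

#include <cstdio>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3, 4};  // stand-ins for instructions

  // Start at end(), pre-decrement, and let 'continue' fall through to
  // the loop test without another decrement - as in the pass above.
  for (auto MII = Block.end(), E = Block.begin(); MII != E;) {
    int &MI = *--MII;
    if (MI == 3)
      continue;                    // skip, like the isDebugValue() case
    std::printf("visit %d\n", MI); // prints 4, then 2, then 1
  }
  return 0;
}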
bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().
                                        getInstrInfo());
  QRI = static_cast<const HexagonRegisterInfo *>(MF.getTarget().
                                       getRegisterInfo());
  MRI = &MF.getRegInfo();

  DenseMap<unsigned, unsigned> PeepholeMap;
  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;

  if (DisableHexagonPeephole) return false;

  // Loop over all of the basic blocks.
  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
       MBBb != MBBe; ++MBBb) {
    MachineBasicBlock* MBB = MBBb;
    PeepholeMap.clear();
    PeepholeDoubleRegsMap.clear();

    // Traverse the basic block.
    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
                                     ++MII) {
      MachineInstr *MI = MII;
      // Look for sign extends:
      // %vreg170<def> = SXTW %vreg166
      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::SXTW) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src  = MI->getOperand(1);
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        // Just handle virtual registers.
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Map the following:
          // %vreg170<def> = SXTW %vreg166
          // PeepholeMap[170] = vreg166
          PeepholeMap[DstReg] = SrcReg;
        }
      }

      // Look for %vreg170<def> = COMBINE_Ir_V4 (0, %vreg169)
      // %vreg170:DoubleRegs, %vreg169:IntRegs
      if (!DisableOptExtTo64 &&
          MI->getOpcode () == Hexagon::COMBINE_Ir_V4) {
        assert (MI->getNumOperands() == 3);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src1 = MI->getOperand(1);
        MachineOperand &Src2 = MI->getOperand(2);
        if (Src1.getImm() != 0)
          continue;
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src2.getReg();
        PeepholeMap[DstReg] = SrcReg;
      }

      // Look for this sequence below
      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
      // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
      // and convert into
      // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
      if (MI->getOpcode() == Hexagon::LSRd_ri) {
        assert(MI->getNumOperands() == 3);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src1 = MI->getOperand(1);
        MachineOperand &Src2 = MI->getOperand(2);
        if (Src2.getImm() != 32)
          continue;
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src1.getReg();
        PeepholeDoubleRegsMap[DstReg] =
          std::make_pair(SrcReg, 1/*Hexagon::subreg_hireg*/);
      }

      // Look for P=NOT(P).
      if (!DisablePNotP &&
          (MI->getOpcode() == Hexagon::NOT_p)) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src  = MI->getOperand(1);
        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        // Just handle virtual registers.
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Map the following:
          // %vreg170<def> = NOT_xx %vreg166
          // PeepholeMap[170] = vreg166
          PeepholeMap[DstReg] = SrcReg;
        }
      }

      // Look for copy:
      // %vreg176<def> = COPY %vreg170:subreg_loreg
      if (!DisableOptSZExt && MI->isCopy()) {
        assert (MI->getNumOperands() == 2);
        MachineOperand &Dst = MI->getOperand(0);
        MachineOperand &Src  = MI->getOperand(1);

        // Make sure we are copying the lower 32 bits.
        if (Src.getSubReg() != Hexagon::subreg_loreg)
          continue;

        unsigned DstReg = Dst.getReg();
        unsigned SrcReg = Src.getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
          // Try to find in the map.
          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
            // Change the 1st operand.
            MI->RemoveOperand(1);
            MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
          } else  {
            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
              PeepholeDoubleRegsMap.find(SrcReg);
            if (DI != PeepholeDoubleRegsMap.end()) {
              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
              MI->RemoveOperand(1);
              MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
                                                       false /*isDef*/,
                                                       false /*isImp*/,
                                                       false /*isKill*/,
                                                       false /*isDead*/,
                                                       false /*isUndef*/,
                                                       false /*isEarlyClobber*/,
                                                       PeepholeSrc.second));
            }
          }
        }
      }

      // Look for Predicated instructions.
      if (!DisablePNotP) {
        bool Done = false;
        if (QII->isPredicated(MI)) {
          MachineOperand &Op0 = MI->getOperand(0);
          unsigned Reg0 = Op0.getReg();
          const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
          if (RC0->getID() == Hexagon::PredRegsRegClassID) {
            // Handle instructions that have a predicate register in op0
            // (most cases of predicable instructions).
            if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
              // Try to find in the map.
              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
                // Change the 1st operand and flip the opcode.
                MI->getOperand(0).setReg(PeepholeSrc);
                int NewOp = QII->getInvertedPredicatedOpcode(MI->getOpcode());
                MI->setDesc(QII->get(NewOp));
                Done = true;
              }
            }
          }
        }

        if (!Done) {
          // Handle special instructions.
          unsigned Op = MI->getOpcode();
          unsigned NewOp = 0;
          unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.

          switch (Op) {
            case Hexagon::TFR_condset_rr:
            case Hexagon::TFR_condset_ii:
            case Hexagon::MUX_ii:
            case Hexagon::MUX_rr:
              NewOp = Op;
              break;
            case Hexagon::TFR_condset_ri:
              NewOp = Hexagon::TFR_condset_ir;
              break;
            case Hexagon::TFR_condset_ir:
              NewOp = Hexagon::TFR_condset_ri;
              break;
            case Hexagon::MUX_ri:
              NewOp = Hexagon::MUX_ir;
              break;
            case Hexagon::MUX_ir:
              NewOp = Hexagon::MUX_ri;
              break;
          }
          if (NewOp) {
            unsigned PSrc = MI->getOperand(PR).getReg();
            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
              MI->getOperand(PR).setReg(POrig);
              MI->setDesc(QII->get(NewOp));
              // Swap operands S1 and S2.
              MachineOperand Op1 = MI->getOperand(S1);
              MachineOperand Op2 = MI->getOperand(S2);
              ChangeOpInto(MI->getOperand(S1), Op2);
              ChangeOpInto(MI->getOperand(S2), Op1);
            }
          } // if (NewOp)
        } // if (!Done)

      } // if (!DisablePNotP)

    } // Instruction
  } // Basic Block
  return true;
}
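
The peephole's core is a two-step map-and-rewrite: record that DstReg was produced by widening SrcReg, then redirect a later narrowing copy to read the original register. A minimal standalone model of that idea (the mini-IR below is invented for illustration):

#include <cstdio>
#include <unordered_map>
#include <vector>

enum Op { SXTW, CopyLo };  // widen, then copy the low half back

struct Inst { Op Opc; unsigned Dst, Src; };

int main() {
  // %171 = SXTW %166;  %176 = COPY %171:lo   ==>   %176 = COPY %166
  std::vector<Inst> Block = {{SXTW, 171, 166}, {CopyLo, 176, 171}};
  std::unordered_map<unsigned, unsigned> PeepholeMap;

  for (Inst &MI : Block) {
    if (MI.Opc == SXTW) {
      PeepholeMap[MI.Dst] = MI.Src;  // remember the pre-extend register
    } else if (MI.Opc == CopyLo) {
      auto It = PeepholeMap.find(MI.Src);
      if (It != PeepholeMap.end())
        MI.Src = It->second;         // bypass the sign-extend
    }
  }
  std::printf("copy now reads %%%u\n", Block[1].Src);  // prints %166
  return 0;
}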
void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineFrameInfo *MFI = MF.getFrameInfo();
  const Function* Fn = MF.getFunction();
  const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineBasicBlock::iterator MBBI = MBB.begin();
  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
                          !Fn->doesNotThrow() ||
                          UnwindTablesMandatory;
  // Prepare for frame info.
  unsigned FrameLabelId = 0;

  // Get the number of bytes to allocate from the FrameInfo.
  uint64_t StackSize = MFI->getStackSize();
  // Get desired stack alignment
  uint64_t MaxAlign  = MFI->getMaxAlignment();

  // Add RETADDR move area to callee saved frame size.
  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
  if (TailCallReturnAddrDelta < 0)
    X86FI->setCalleeSavedFrameSize(
          X86FI->getCalleeSavedFrameSize() +(-TailCallReturnAddrDelta));

  // If this is x86-64 and the Red Zone is not disabled, and we are a leaf
  // function that uses up to 128 bytes of stack space and has no frame
  // pointer, calls, or dynamic allocas, then we do not need to adjust the
  // stack pointer (we fit in the Red Zone).
  if (Is64Bit && !DisableRedZone &&
      !needsStackRealignment(MF) &&
      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
      !MFI->hasCalls()) {                          // No calls.
    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
    if (hasFP(MF)) MinSize += SlotSize;
    StackSize = std::max(MinSize,
                         StackSize > 128 ? StackSize - 128 : 0);
    MFI->setStackSize(StackSize);
  }
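
  // (Worked example with illustrative numbers: a leaf frame with
  // StackSize = 104 and MinSize = 0 yields max(0, 0) = 0 - the whole
  // frame fits in the Red Zone and no SP adjustment is emitted; with
  // StackSize = 200 it yields max(0, 72), so only 72 bytes are allocated.)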

  // Insert stack pointer adjustment for later moving of return addr.  Only
  // applies to tail call optimized functions where the callee argument stack
  // size is bigger than the callers.
  if (TailCallReturnAddrDelta < 0) {
    MachineInstr *MI =
      BuildMI(MBB, MBBI, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
              StackPtr).addReg(StackPtr).addImm(-TailCallReturnAddrDelta);
    // The EFLAGS implicit def is dead.
    MI->getOperand(3).setIsDead();
  }

  uint64_t NumBytes = 0;
  if (hasFP(MF)) {
    // Calculate required stack adjustment
    uint64_t FrameSize = StackSize - SlotSize;
    if (needsStackRealignment(MF))
      FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;

    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();

    // Get the offset of the stack slot for the EBP register... which is
    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
    // Update the frame offset adjustment.
    MFI->setOffsetAdjustment(-NumBytes);

    // Save EBP into the appropriate stack slot...
    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
      .addReg(FramePtr, /*isDef=*/false, /*isImp=*/false, /*isKill=*/true);

    if (needsFrameMoves) {
      // Mark effective beginning of when frame pointer becomes valid.
      FrameLabelId = MMI->NextLabelID();
      BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);
    }

    // Update EBP with the new base value...
    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
      .addReg(StackPtr);

    // Mark the FramePtr as live-in in every block except the entry.
    for (MachineFunction::iterator I = next(MF.begin()), E = MF.end();
         I != E; ++I)
      I->addLiveIn(FramePtr);

    // Realign stack
    if (needsStackRealignment(MF)) {
      MachineInstr *MI =
        BuildMI(MBB, MBBI,
                TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
                StackPtr).addReg(StackPtr).addImm(-MaxAlign);
      // The EFLAGS implicit def is dead.
      MI->getOperand(3).setIsDead();
    }
  } else
    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();

  unsigned ReadyLabelId = 0;
  if (needsFrameMoves) {
    // Mark effective beginning of when frame pointer is ready.
    ReadyLabelId = MMI->NextLabelID();
    BuildMI(MBB, MBBI, TII.get(X86::DBG_LABEL)).addImm(ReadyLabelId);
  }

  // Skip the callee-saved push instructions.
  while (MBBI != MBB.end() &&
         (MBBI->getOpcode() == X86::PUSH32r ||
          MBBI->getOpcode() == X86::PUSH64r))
    ++MBBI;

  if (NumBytes) {   // adjust stack pointer: ESP -= numbytes
    if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
      // Check whether EAX is live-in for this function.
      bool isEAXAlive = false;
      for (MachineRegisterInfo::livein_iterator
           II = MF.getRegInfo().livein_begin(),
           EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
        unsigned Reg = II->first;
        isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
                      Reg == X86::AH || Reg == X86::AL);
      }

      // Function prologue calls _alloca to probe the stack when allocating
      // more than 4k bytes in one go. Touching the stack at 4K increments is
      // necessary to ensure that the guard pages used by the OS virtual memory
      // manager are allocated in correct sequence.
      if (!isEAXAlive) {
        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes);
        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("_alloca");
      } else {
        // Save EAX
        BuildMI(MBB, MBBI, TII.get(X86::PUSH32r))
          .addReg(X86::EAX, /*isDef=*/false, /*isImp=*/false, /*isKill=*/true);
        // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
        // allocated bytes for EAX.
        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
          .addExternalSymbol("_alloca");
        // Restore EAX
        MachineInstr *MI = addRegOffset(BuildMI(MF, TII.get(X86::MOV32rm),X86::EAX),
                                        StackPtr, false, NumBytes-4);
        MBB.insert(MBBI, MI);
      }
    } else {
      // If there is an SUB32ri of ESP immediately before this instruction,
      // merge the two. This can be the case when tail call elimination is
      // enabled and the callee has more arguments than the caller.
      NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
      // If there is an ADD32ri or SUB32ri of ESP immediately after this
      // instruction, merge the two instructions.
      mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);

      if (NumBytes)
        emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
    }
  }

  if (needsFrameMoves)
    emitFrameMoves(MF, FrameLabelId, ReadyLabelId);
}
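
// A hedged sketch of the emitSPUpdate helper the prologue above calls: pick
// an ADD/SUB of the stack pointer based on the sign of NumBytes and the word
// size. The in-tree helper also splits adjustments that do not fit in an
// imm32; this sketch assumes the offset fits.
static void emitSPUpdateSketch(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
                               unsigned StackPtr, int64_t NumBytes,
                               bool Is64Bit, const TargetInstrInfo &TII) {
  bool isSub = NumBytes < 0;
  uint64_t Offset = isSub ? -NumBytes : NumBytes;
  unsigned Opc = isSub ? (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri)
                       : (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
  MachineInstr *MI = BuildMI(MBB, MBBI, TII.get(Opc), StackPtr)
                       .addReg(StackPtr).addImm(Offset);
  // As in the prologue above, the EFLAGS implicit def is dead.
  MI->getOperand(3).setIsDead();
}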
  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
        ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr * > IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // FALLTHROUGH: the (possibly rewritten) clause is emitted as CF_ALU.

        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
Example #12
0
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
                                                      I != E; ++I) {
      MachineInstr &MI = *I;
      if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
        DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
        DEBUG(MI.print(dbgs()));
        TII->moveToVALU(MI);

      }

      switch (MI.getOpcode()) {
      default: continue;
      case AMDGPU::PHI: {
        DEBUG(dbgs() << "Fixing PHI: " << MI);

        for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
          const MachineOperand &Op = MI.getOperand(i);
          unsigned Reg = Op.getReg();
          const TargetRegisterClass *RC
            = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg());

          MRI.constrainRegClass(Op.getReg(), RC);
        }
        unsigned Reg = MI.getOperand(0).getReg();
        const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
                                                  MI.getOperand(0).getSubReg());
        if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) {
          MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass);
        }

        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
          break;

        // If a PHI node defines an SGPR and any of its operands are VGPRs,
        // then we need to move it to the VALU.
        //
        // Also, if a PHI node defines an SGPR and has all SGPR operands
        // we must move it to the VALU, because the SGPR operands will
        // all end up being assigned the same register, which means
        // there is a potential for a conflict if different threads take
        // different control flow paths.
        //
        // For Example:
        //
        // sgpr0 = def;
        // ...
        // sgpr1 = def;
        // ...
        // sgpr2 = PHI sgpr0, sgpr1
        // use sgpr2;
        //
        // Will Become:
        //
        // sgpr2 = def;
        // ...
        // sgpr2 = def;
        // ...
        // use sgpr2
        //
        // FIXME: This is OK if the branching decision is made based on an
        // SGPR value.
        bool SGPRBranch = false;

        // The one exception to this rule is when one of the operands
        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
        // instruction.  In this case, we know the program will
        // never enter the second block (the loop) without entering
        // the first block (where the condition is computed), so there
        // is no chance for values to be overwritten.

        bool HasBreakDef = false;
        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
          unsigned Reg = MI.getOperand(i).getReg();
          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
            TII->moveToVALU(MI);
            break;
          }
          MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
          assert(DefInstr);
          switch(DefInstr->getOpcode()) {

          case AMDGPU::SI_BREAK:
          case AMDGPU::SI_IF_BREAK:
          case AMDGPU::SI_ELSE_BREAK:
          // If we see a PHI instruction that defines an SGPR, then that PHI
          // instruction has already been considered and should have
          // a *_BREAK as an operand.
          case AMDGPU::PHI:
            HasBreakDef = true;
            break;
          }
        }

        if (!SGPRBranch && !HasBreakDef)
          TII->moveToVALU(MI);
        break;
      }
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
            !hasVGPROperands(MI, TRI))
          continue;

        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);

        TII->moveToVALU(MI);
        break;
      }
      case AMDGPU::INSERT_SUBREG: {
        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
        if (TRI->isSGPRClass(DstRC) &&
            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
          TII->moveToVALU(MI);
        }
        break;
      }
      }
    }
  }

  return true;
}
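
// A minimal sketch of the isVGPRToSGPRCopy predicate used above, assuming the
// check is purely register-class based: a COPY is illegal for SALU execution
// when its source class contains VGPRs but its destination class is scalar.
static bool isVGPRToSGPRCopySketch(const MachineInstr &MI,
                                   const SIRegisterInfo *TRI,
                                   const MachineRegisterInfo &MRI) {
  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SrcReg = MI.getOperand(1).getReg();
  if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
      !TargetRegisterInfo::isVirtualRegister(SrcReg))
    return false;
  return TRI->isSGPRClass(MRI.getRegClass(DstReg)) &&
         TRI->hasVGPRs(MRI.getRegClass(SrcReg));
}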
Example #13
0
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
        unsigned Reg = MI.getOperand(0).getReg();
        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
        if (RC == &AMDGPU::VReg_1RegClass)
          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
        continue;
      }

      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      const MachineOperand &Dst = MI.getOperand(0);
      const MachineOperand &Src = MI.getOperand(1);

      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());

      if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
        I1Defs.push_back(Dst.getReg());
        DebugLoc DL = MI.getDebugLoc();

        MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
          if (DefInst->getOperand(1).isImm()) {
            I1Defs.push_back(Dst.getReg());

            int64_t Val = DefInst->getOperand(1).getImm();
            assert(Val == 0 || Val == -1);

            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
              .addOperand(Dst)
              .addImm(Val);
            MI.eraseFromParent();
            continue;
          }
        }

        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
          .addOperand(Dst)
          .addImm(0)
          .addImm(-1)
          .addOperand(Src);
        MI.eraseFromParent();
      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                 SrcRC == &AMDGPU::VReg_1RegClass) {
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_U32_e64))
          .addOperand(Dst)
          .addOperand(Src)
          .addImm(0);
        MI.eraseFromParent();
      }
    }
  }

  for (unsigned Reg : I1Defs)
    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);

  return false;
}
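
// A small hedged helper restating the class test both lowering directions
// above start from: VReg_1 is the placeholder register class for i1 values,
// so a copy is interesting only if one side currently lives in that class.
// The helper name is hypothetical.
static bool isVReg1Sketch(unsigned Reg, const MachineRegisterInfo &MRI) {
  return TargetRegisterInfo::isVirtualRegister(Reg) &&
         MRI.getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
}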
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isDS(MI.getOpcode()))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode()))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:
          NeedWQM = true;
          break;
      }
    }
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should be smaller than 16 bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}
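
// A hedged restatement of the flat-scratch offset computation above as a
// standalone helper (hypothetical name): register index IndirectBegin starts
// 4*IndirectBegin bytes into scratch, expressed in the 256-byte units the
// hardware expects, and clamped to zero when no indirect registers are used.
static uint64_t getFlatScratchOffsetUnitsSketch(int IndirectBegin) {
  if (IndirectBegin < 0)
    return 0;
  return (4 * static_cast<uint64_t>(IndirectBegin)) / 256;
}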
Example #15
0
bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
  raw_ostream *OutFile = 0;
  if (OutFileName) {
    std::string ErrorInfo;
    OutFile = new raw_fd_ostream(OutFileName, ErrorInfo,
                                 raw_fd_ostream::F_Append);
    if (!ErrorInfo.empty()) {
      errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n';
      exit(1);
    }

    OS = OutFile;
  } else {
    OS = &errs();
  }

  foundErrors = 0;

  this->MF = &MF;
  TM = &MF.getTarget();
  TII = TM->getInstrInfo();
  TRI = TM->getRegisterInfo();
  MRI = &MF.getRegInfo();

  LiveVars = NULL;
  LiveInts = NULL;
  LiveStks = NULL;
  Indexes = NULL;
  if (PASS) {
    LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>();
    // We don't want to verify LiveVariables if LiveIntervals is available.
    if (!LiveInts)
      LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>();
    LiveStks = PASS->getAnalysisIfAvailable<LiveStacks>();
    Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>();
  }

  visitMachineFunctionBefore();
  for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
       MFI!=MFE; ++MFI) {
    visitMachineBasicBlockBefore(MFI);
    for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(),
           MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
      if (MBBI->getParent() != MFI) {
        report("Bad instruction parent pointer", MFI);
        *OS << "Instruction: " << *MBBI;
        continue;
      }
      // Skip BUNDLE instructions for now. FIXME: We should add code to verify
      // the BUNDLEs specifically.
      if (MBBI->isBundle())
        continue;
      visitMachineInstrBefore(MBBI);
      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I)
        visitMachineOperand(&MBBI->getOperand(I), I);
      visitMachineInstrAfter(MBBI);
    }
    visitMachineBasicBlockAfter(MFI);
  }
  visitMachineFunctionAfter();

  if (OutFile)
    delete OutFile;
  else if (foundErrors)
    report_fatal_error("Found "+Twine(foundErrors)+" machine code errors.");

  // Clean up.
  regsLive.clear();
  regsDefined.clear();
  regsDead.clear();
  regsKilled.clear();
  regMasks.clear();
  regsLiveInButUnused.clear();
  MBBInfoMap.clear();

  return false;                 // no changes
}
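
// A minimal sketch of the report() helper the verifier above relies on,
// assuming it only counts the error and prints a header; the in-tree version
// also records block/instruction context and writes to the member OS stream.
static void reportSketch(const char *Msg, const MachineFunction *MF,
                         unsigned &foundErrors, raw_ostream &OS) {
  ++foundErrors;
  OS << "*** Bad machine code: " << Msg << " ***\n"
     << "- function:    " << MF->getFunction()->getName() << '\n';
}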
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how long
      // to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.add(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }


      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.add(*Src1);

      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.add(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');


    }
  }
  return false;
}
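
// A hedged sketch of the isReverseInlineImm test used twice above: reverse
// the 32-bit immediate and ask whether the result is an inline constant.
// The exact isInlineConstant overload is an assumption here.
static bool isReverseInlineImmSketch(const SIInstrInfo *TII,
                                     const MachineOperand &Src,
                                     int32_t &ReverseImm) {
  if (!Src.isImm())
    return false;
  uint32_t Imm = static_cast<uint32_t>(Src.getImm());
  ReverseImm = static_cast<int32_t>(reverseBits<uint32_t>(Imm));
  return TII->isInlineConstant(MachineOperand::CreateImm(ReverseImm));
}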
Example #17
0
bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {

  const R600RegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    MachineBasicBlock::iterator I = MBB.begin();
    while (I != MBB.end()) {
      MachineInstr &MI = *I;
      I = llvm::next(I);

      switch (MI.getOpcode()) {
      default: break;
      // Expand PRED_X to one of the PRED_SET instructions.
      case AMDGPU::PRED_X: {
        uint64_t Flags = MI.getOperand(3).getImm();
        // The native opcode used by PRED_X is stored as an immediate in the
        // third operand.
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                            MI.getOperand(2).getImm(), // opcode
                                            MI.getOperand(0).getReg(), // dst
                                            MI.getOperand(1).getReg(), // src0
                                            AMDGPU::ZERO);             // src1
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
        if (Flags & MO_FLAG_PUSH) {
          TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
        } else {
          TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
        }
        MI.eraseFromParent();
        continue;
        }
      case AMDGPU::BREAK: {
        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
                                          AMDGPU::PRED_SETE_INT,
                                          AMDGPU::PREDICATE_BIT,
                                          AMDGPU::ZERO,
                                          AMDGPU::ZERO);
        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
        TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);

        BuildMI(MBB, I, MBB.findDebugLoc(I),
                TII->get(AMDGPU::PREDICATED_BREAK))
                .addReg(AMDGPU::PREDICATE_BIT);
        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_XY: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = MI.getOperand(Chan).getReg();
          else
            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan >= 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_PAIR_ZW: {
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(2).getImm());

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          unsigned DstReg;

          if (Chan < 2)
            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
          else
            DstReg = MI.getOperand(Chan-2).getReg();

          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);

          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan < 2)
            TII->addFlag(BMI, 0, MO_FLAG_MASK);
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }

      case AMDGPU::INTERP_VEC_LOAD: {
        const R600RegisterInfo &TRI = TII->getRegisterInfo();
        MachineInstr *BMI;
        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
                MI.getOperand(1).getImm());
        unsigned DstReg = MI.getOperand(0).getReg();

        for (unsigned Chan = 0; Chan < 4; ++Chan) {
          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
          if (Chan > 0) {
            BMI->bundleWithPred();
          }
          if (Chan != 3)
            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
        }

        MI.eraseFromParent();
        continue;
        }
      }

      bool IsReduction = TII->isReductionOp(MI.getOpcode());
      bool IsVector = TII->isVector(MI);
      bool IsCube = TII->isCubeOp(MI.getOpcode());
      if (!IsReduction && !IsVector && !IsCube) {
        continue;
      }

      // Expand the instruction
      //
      // Reduction instructions:
      // T0_X = DP4 T1_XYZW, T2_XYZW
      // becomes:
      // T0_X = DP4 T1_X, T2_X
      // T0_Y (write masked) = DP4 T1_Y, T2_Y
      // T0_Z (write masked) = DP4 T1_Z, T2_Z
      // T0_W (write masked) = DP4 T1_W, T2_W
      //
      // Vector instructions:
      // T0_X = MULLO_INT T1_X, T2_X
      // becomes:
      // T0_X = MULLO_INT T1_X, T2_X
      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
      // T0_W (write masked) = MULLO_INT T1_X, T2_X
      //
      // Cube instructions:
      // T0_XYZW = CUBE T1_XYZW
      // becomes:
      // T0_X = CUBE T1_Z, T1_Y
      // T0_Y = CUBE T1_Z, T1_X
      // T0_Z = CUBE T1_X, T1_Z
      // T0_W = CUBE T1_Y, T1_Z
      for (unsigned Chan = 0; Chan < 4; Chan++) {
        unsigned DstReg = MI.getOperand(
                            TII->getOperandIdx(MI, R600Operands::DST)).getReg();
        unsigned Src0 = MI.getOperand(
                           TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
        unsigned Src1 = 0;

        // Determine the correct source registers
        if (!IsCube) {
          int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
          if (Src1Idx != -1) {
            Src1 = MI.getOperand(Src1Idx).getReg();
          }
        }
        if (IsReduction) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          Src0 = TRI.getSubReg(Src0, SubRegIndex);
          Src1 = TRI.getSubReg(Src1, SubRegIndex);
        } else if (IsCube) {
          static const int CubeSrcSwz[] = {2, 2, 0, 1};
          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
        }

        // Determine the correct destination registers.
        bool Mask = false;
        bool NotLast = true;
        if (IsCube) {
          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
        } else {
          // Mask the write if the original instruction does not write to
          // the current Channel.
          Mask = (Chan != TRI.getHWRegChan(DstReg));
          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
        }

        // Set the IsLast bit
        NotLast = (Chan != 3);

        // Add the new instruction
        unsigned Opcode = MI.getOpcode();
        switch (Opcode) {
        case AMDGPU::CUBE_r600_pseudo:
          Opcode = AMDGPU::CUBE_r600_real;
          break;
        case AMDGPU::CUBE_eg_pseudo:
          Opcode = AMDGPU::CUBE_eg_real;
          break;
        case AMDGPU::DOT4_r600_pseudo:
          Opcode = AMDGPU::DOT4_r600_real;
          break;
        case AMDGPU::DOT4_eg_pseudo:
          Opcode = AMDGPU::DOT4_eg_real;
          break;
        default:
          break;
        }

        MachineInstr *NewMI =
          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);

        if (Chan != 0)
          NewMI->bundleWithPred();
        if (Mask) {
          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
        }
        if (NotLast) {
          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
        }
      }
      MI.eraseFromParent();
    }
  }
  return false;
}
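
// A minimal sketch of the per-channel destination arithmetic above
// (hypothetical helper name): R600 T registers are laid out in groups of
// four channels, so the register for a given channel is base*4 + channel.
// HW_REG_MASK is assumed visible from the surrounding pass.
static unsigned getTReg32ForChannelSketch(const R600RegisterInfo &TRI,
                                          unsigned DstReg, unsigned Chan) {
  unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
  return AMDGPU::R600_TReg32RegClass.getRegister(DstBase * 4 + Chan);
}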
Example #18
0
/// runOnMachineFunction - Reduce two-address instructions to two operands.
///
bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
  DEBUG(errs() << "Machine Function\n");
  const TargetMachine &TM = MF.getTarget();
  MRI = &MF.getRegInfo();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  LV = getAnalysisIfAvailable<LiveVariables>();
  AA = &getAnalysis<AliasAnalysis>();

  bool MadeChange = false;

  DEBUG(errs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
  DEBUG(errs() << "********** Function: " 
        << MF.getFunction()->getName() << '\n');

  // ReMatRegs - Keep track of the registers whose def's are remat'ed.
  BitVector ReMatRegs;
  ReMatRegs.resize(MRI->getLastVirtReg()+1);

  typedef DenseMap<unsigned, SmallVector<std::pair<unsigned, unsigned>, 4> >
    TiedOperandMap;
  TiedOperandMap TiedOperands(4);

  SmallPtrSet<MachineInstr*, 8> Processed;
  for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
       mbbi != mbbe; ++mbbi) {
    unsigned Dist = 0;
    DistanceMap.clear();
    SrcRegMap.clear();
    DstRegMap.clear();
    Processed.clear();
    for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
         mi != me; ) {
      MachineBasicBlock::iterator nmi = next(mi);
      const TargetInstrDesc &TID = mi->getDesc();
      bool FirstTied = true;

      DistanceMap.insert(std::make_pair(mi, ++Dist));

      ProcessCopy(&*mi, &*mbbi, Processed);

      // First scan through all the tied register uses in this instruction
      // and record a list of pairs of tied operands for each register.
      unsigned NumOps = (mi->getOpcode() == TargetInstrInfo::INLINEASM)
        ? mi->getNumOperands() : TID.getNumOperands();
      for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
        unsigned DstIdx = 0;
        if (!mi->isRegTiedToDefOperand(SrcIdx, &DstIdx))
          continue;

        if (FirstTied) {
          FirstTied = false;
          ++NumTwoAddressInstrs;
          DEBUG(errs() << '\t' << *mi);
        }

        assert(mi->getOperand(SrcIdx).isReg() &&
               mi->getOperand(SrcIdx).getReg() &&
               mi->getOperand(SrcIdx).isUse() &&
               "two address instruction invalid");

        unsigned regB = mi->getOperand(SrcIdx).getReg();
        TiedOperandMap::iterator OI = TiedOperands.find(regB);
        if (OI == TiedOperands.end()) {
          SmallVector<std::pair<unsigned, unsigned>, 4> TiedPair;
          OI = TiedOperands.insert(std::make_pair(regB, TiedPair)).first;
        }
        OI->second.push_back(std::make_pair(SrcIdx, DstIdx));
      }

      // Now iterate over the information collected above.
      for (TiedOperandMap::iterator OI = TiedOperands.begin(),
             OE = TiedOperands.end(); OI != OE; ++OI) {
        SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second;

        // If the instruction has a single pair of tied operands, try some
        // transformations that may either eliminate the tied operands or
        // improve the opportunities for coalescing away the register copy.
        if (TiedOperands.size() == 1 && TiedPairs.size() == 1) {
          unsigned SrcIdx = TiedPairs[0].first;
          unsigned DstIdx = TiedPairs[0].second;

          // If the registers are already equal, nothing needs to be done.
          if (mi->getOperand(SrcIdx).getReg() ==
              mi->getOperand(DstIdx).getReg())
            break; // Done with this instruction.

          if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist))
            break; // The tied operands have been eliminated.
        }

        bool RemovedKillFlag = false;
        bool AllUsesCopied = true;
        unsigned LastCopiedReg = 0;
        unsigned regB = OI->first;
        for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
          unsigned SrcIdx = TiedPairs[tpi].first;
          unsigned DstIdx = TiedPairs[tpi].second;
          unsigned regA = mi->getOperand(DstIdx).getReg();
          // Grab regB from the instruction because it may have changed if the
          // instruction was commuted.
          regB = mi->getOperand(SrcIdx).getReg();

          if (regA == regB) {
            // The register is tied to multiple destinations (or else we would
            // not have continued this far), but this use of the register
            // already matches the tied destination.  Leave it.
            AllUsesCopied = false;
            continue;
          }
          LastCopiedReg = regA;

          assert(TargetRegisterInfo::isVirtualRegister(regB) &&
                 "cannot make instruction into two-address form");

#ifndef NDEBUG
          // First, verify that we don't have a use of "a" in the instruction
          // (a = b + a for example) because our transformation will not
          // work. This should never occur because we are in SSA form.
          for (unsigned i = 0; i != mi->getNumOperands(); ++i)
            assert(i == DstIdx ||
                   !mi->getOperand(i).isReg() ||
                   mi->getOperand(i).getReg() != regA);
#endif

          // Emit a copy or rematerialize the definition.
          const TargetRegisterClass *rc = MRI->getRegClass(regB);
          MachineInstr *DefMI = MRI->getVRegDef(regB);
          // If it's safe and profitable, remat the definition instead of
          // copying it.
          if (DefMI &&
              DefMI->getDesc().isAsCheapAsAMove() &&
              DefMI->isSafeToReMat(TII, regB, AA) &&
              isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){
            DEBUG(errs() << "2addr: REMATTING : " << *DefMI << "\n");
            unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg();
            TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, TRI);
            ReMatRegs.set(regB);
            ++NumReMats;
          } else {
            bool Emitted = TII->copyRegToReg(*mbbi, mi, regA, regB, rc, rc);
            (void)Emitted;
            assert(Emitted && "Unable to issue a copy instruction!\n");
          }

          MachineBasicBlock::iterator prevMI = prior(mi);
          // Update DistanceMap.
          DistanceMap.insert(std::make_pair(prevMI, Dist));
          DistanceMap[mi] = ++Dist;

          DEBUG(errs() << "\t\tprepend:\t" << *prevMI);

          MachineOperand &MO = mi->getOperand(SrcIdx);
          assert(MO.isReg() && MO.getReg() == regB && MO.isUse() &&
                 "inconsistent operand info for 2-reg pass");
          if (MO.isKill()) {
            MO.setIsKill(false);
            RemovedKillFlag = true;
          }
          MO.setReg(regA);
        }

        if (AllUsesCopied) {
          // Replace other (un-tied) uses of regB with LastCopiedReg.
          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = mi->getOperand(i);
            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
              if (MO.isKill()) {
                MO.setIsKill(false);
                RemovedKillFlag = true;
              }
              MO.setReg(LastCopiedReg);
            }
          }

          // Update live variables for regB.
          if (RemovedKillFlag && LV && LV->getVarInfo(regB).removeKill(mi))
            LV->addVirtualRegisterKilled(regB, prior(mi));

        } else if (RemovedKillFlag) {
          // Some tied uses of regB matched their destination registers, so
          // regB is still used in this instruction, but a kill flag was
          // removed from a different tied use of regB, so now we need to add
          // a kill flag to one of the remaining uses of regB.
          for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = mi->getOperand(i);
            if (MO.isReg() && MO.getReg() == regB && MO.isUse()) {
              MO.setIsKill(true);
              break;
            }
          }
        }
          
        MadeChange = true;

        DEBUG(errs() << "\t\trewrite to:\t" << *mi);
      }

      // Clear TiedOperands here instead of at the top of the loop
      // since most instructions do not have tied operands.
      TiedOperands.clear();
      mi = nmi;
    }
  }

  // Some remat'ed instructions are dead.
  int VReg = ReMatRegs.find_first();
  while (VReg != -1) {
    if (MRI->use_empty(VReg)) {
      MachineInstr *DefMI = MRI->getVRegDef(VReg);
      DefMI->eraseFromParent();
    }
    VReg = ReMatRegs.find_next(VReg);
  }

  return MadeChange;
}
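
// The tied-operand scan at the core of the pass above, restated in isolation
// as a hedged predicate (hypothetical name): an instruction is a two-address
// candidate if any use operand is tied to a def operand.
static bool hasTiedOperandsSketch(const MachineInstr *MI) {
  for (unsigned SrcIdx = 0, E = MI->getNumOperands(); SrcIdx != E; ++SrcIdx) {
    unsigned DstIdx = 0;
    if (MI->isRegTiedToDefOperand(SrcIdx, &DstIdx))
      return true;
  }
  return false;
}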
Example #19
0
void MCS51FrameLowering::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MCS51MachineFunctionInfo *MCS51FI = MF.getInfo<MCS51MachineFunctionInfo>();
  const MCS51InstrInfo &TII =
    *static_cast<const MCS51InstrInfo*>(MF.getTarget().getInstrInfo());

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();

  // Get the number of bytes to allocate from the FrameInfo.
  uint64_t StackSize = MFI->getStackSize();

  uint64_t NumBytes = 0;
  if (hasFP(MF)) {
    // Calculate required stack adjustment
    uint64_t FrameSize = StackSize - 2;
    NumBytes = FrameSize - MCS51FI->getCalleeSavedFrameSize();

    // Get the offset of the stack slot for the EBP register... which is
    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
    // Update the frame offset adjustment.
    MFI->setOffsetAdjustment(-NumBytes);

    // Save FPW into the appropriate stack slot...
    BuildMI(MBB, MBBI, DL, TII.get(MCS51::PUSH16r))
      .addReg(MCS51::FPW, RegState::Kill);

    // Update FPW with the new base value...
    BuildMI(MBB, MBBI, DL, TII.get(MCS51::MOV16rr), MCS51::FPW)
      .addReg(MCS51::SPW);

    // Mark the FramePtr as live-in in every block except the entry.
    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
         I != E; ++I)
      I->addLiveIn(MCS51::FPW);

  } else
    NumBytes = StackSize - MCS51FI->getCalleeSavedFrameSize();

  // Skip the callee-saved push instructions.
  while (MBBI != MBB.end() && (MBBI->getOpcode() == MCS51::PUSH16r))
    ++MBBI;

  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  if (NumBytes) { // adjust stack pointer: SPW -= numbytes
    // If there is an SUB16ri of SPW immediately before this instruction, merge
    // the two.
    //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
    // If there is an ADD16ri or SUB16ri of SPW immediately after this
    // instruction, merge the two instructions.
    // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);

    if (NumBytes) {
      MachineInstr *MI =
        BuildMI(MBB, MBBI, DL, TII.get(MCS51::SUB16ri), MCS51::SPW)
        .addReg(MCS51::SPW).addImm(NumBytes);
      // The SRW implicit def is dead.
      MI->getOperand(3).setIsDead();
    }
  }
}
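
// A hedged sketch of the hasFP() decision the prologue above branches on,
// assuming the usual criteria: frame pointer elimination disabled,
// variable-sized objects, or a taken frame address force a frame pointer.
// The in-tree predicate for this target may check additional conditions.
static bool hasFPSketch(const MachineFunction &MF) {
  const MachineFrameInfo *MFI = MF.getFrameInfo();
  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
         MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
}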
Example #20
0
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock *EmptyMBBAtEnd = NULL;
    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    bool ExecModified = false;

    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI))
        NeedFlat = true;

      for (const auto &Def : I->defs()) {
        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
          ExecModified = true;
          break;
        }
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI, ExecModified);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC_V1:
        case AMDGPU::SI_INDIRECT_SRC_V2:
        case AMDGPU::SI_INDIRECT_SRC_V4:
        case AMDGPU::SI_INDIRECT_SRC_V8:
        case AMDGPU::SI_INDIRECT_SRC_V16:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::S_ENDPGM: {
          if (MF.getInfo<SIMachineFunctionInfo>()->returnsVoid())
            break;

          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
          // because external bytecode will be appended at the end.
          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
            // S_ENDPGM is not the last instruction. Add an empty block at
            // the end and jump there.
            if (!EmptyMBBAtEnd) {
              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
              MF.insert(MF.end(), EmptyMBBAtEnd);
            }

            MBB.addSuccessor(EmptyMBBAtEnd);
            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
                    .addMBB(EmptyMBBAtEnd);
          }

          I->eraseFromParent();
          break;
        }
      }
    }
  }

  if (NeedFlat && MFI->IsKernel) {
    // TODO: What to use with function calls?
    // We will need to initialize the flat scratch register pair.
    MFI->setHasFlatInstructions(true);
  }

  return true;
}
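
// The defs() scan above, restated as a hedged standalone predicate
// (hypothetical name): an instruction "modifies exec" if any of its register
// defs names the EXEC register.
static bool modifiesExecSketch(const MachineInstr &MI) {
  for (const MachineOperand &Def : MI.defs())
    if (Def.isReg() && Def.getReg() == AMDGPU::EXEC)
      return true;
  return false;
}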
/// MoveDiscontiguousLoopBlocks - Move any loop blocks that are not in the
/// portion of the loop contiguous with the header. This usually makes the loop
/// contiguous, provided that AnalyzeBranch can handle all the relevant
/// branching. See the @cfg_islands case in test/CodeGen/X86/loop_blocks.ll
/// for an example of this.
bool CodePlacementOpt::MoveDiscontiguousLoopBlocks(MachineFunction &MF,
                                                   MachineLoop *L) {
  bool Changed = false;
  MachineBasicBlock *TopMBB = L->getTopBlock();
  MachineBasicBlock *BotMBB = L->getBottomBlock();

  // Determine a position to move orphaned loop blocks to. If TopMBB is not
  // entered via fallthrough and BotMBB is exited via fallthrough, prepend them
  // to the top of the loop to avoid losing that fallthrough. Otherwise append
  // them to the bottom, even if it previously had a fallthrough, on the theory
  // that it's worth an extra branch to keep the loop contiguous.
  MachineFunction::iterator InsertPt =
    llvm::next(MachineFunction::iterator(BotMBB));
  bool InsertAtTop = false;
  if (TopMBB != MF.begin() &&
      !HasFallthrough(prior(MachineFunction::iterator(TopMBB))) &&
      HasFallthrough(BotMBB)) {
    InsertPt = TopMBB;
    InsertAtTop = true;
  }

  // Keep a record of which blocks are in the portion of the loop contiguous
  // with the loop header.
  SmallPtrSet<MachineBasicBlock *, 8> ContiguousBlocks;
  for (MachineFunction::iterator I = TopMBB,
       E = llvm::next(MachineFunction::iterator(BotMBB)); I != E; ++I)
    ContiguousBlocks.insert(I);

  // Find non-contiguous blocks and fix them.
  if (InsertPt != MF.begin() && HasAnalyzableTerminator(prior(InsertPt)))
    for (MachineLoop::block_iterator BI = L->block_begin(), BE = L->block_end();
         BI != BE; ++BI) {
      MachineBasicBlock *BB = *BI;

      // Verify that we can analyze all the loop entry edges before beginning
      // any changes which will require us to be able to analyze them.
      if (!HasAnalyzableTerminator(BB))
        continue;
      if (!HasAnalyzableTerminator(prior(MachineFunction::iterator(BB))))
        continue;

      // If the layout predecessor is part of the loop, this block will be
      // processed along with it. This keeps them in their relative order.
      if (BB != MF.begin() &&
          L->contains(prior(MachineFunction::iterator(BB))))
        continue;

      // Check to see if this block is already contiguous with the main
      // portion of the loop.
      if (!ContiguousBlocks.insert(BB))
        continue;

      // Move the block.
      DEBUG(dbgs() << "CGP: Moving blocks starting at BB#" << BB->getNumber()
                   << " to be contiguous with loop.\n");
      Changed = true;

      // Process this block and all loop blocks contiguous with it, to keep
      // them in their relative order.
      MachineFunction::iterator Begin = BB;
      MachineFunction::iterator End = llvm::next(MachineFunction::iterator(BB));
      for (; End != MF.end(); ++End) {
        if (!L->contains(End)) break;
        if (!HasAnalyzableTerminator(End)) break;
        ContiguousBlocks.insert(End);
        ++NumIntraMoved;
      }

      // If we're inserting at the bottom of the loop, and the code we're
      // moving originally had fall-through successors, bring the successors
      // up with the loop blocks to preserve the fall-through edges.
      if (!InsertAtTop)
        for (; End != MF.end(); ++End) {
          if (L->contains(End)) break;
          if (!HasAnalyzableTerminator(End)) break;
          if (!HasFallthrough(prior(End))) break;
        }

      // Move the blocks. This may invalidate TopMBB and/or BotMBB, but
      // we don't need them anymore at this point.
      Splice(MF, InsertPt, Begin, End);
    }

  return Changed;
}
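The Splice helper called above is outside this excerpt. A minimal sketch of
what it plausibly does, assuming MachineFunction::splice moves the range and
MachineBasicBlock::updateTerminator repairs the branches around the seam
(illustrative, not the original implementation):

void CodePlacementOpt::Splice(MachineFunction &MF,
                              MachineFunction::iterator InsertPt,
                              MachineFunction::iterator Begin,
                              MachineFunction::iterator End) {
  assert(Begin != MF.begin() && InsertPt != MF.begin() &&
         "Splice can't change the entry block!");
  MachineFunction::iterator OldBeginPrior = prior(Begin);
  MachineFunction::iterator OldEndPrior = prior(End);

  // Move [Begin, End) to sit immediately before InsertPt.
  MF.splice(InsertPt, Begin, End);

  // Fix up the terminators of the blocks whose layout successor changed.
  prior(Begin)->updateTerminator();
  OldBeginPrior->updateTerminator();
  OldEndPrior->updateTerminator();
}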
Example #22
0
bool GCMachineCodeFixup::runOnMachineFunction(MachineFunction &MF) {
  // Quick exit for functions that do not use GC.
  if (!MF.getFunction()->hasGC())
    return false;

  const TargetMachine &TM = MF.getTarget();
  const TargetInstrInfo *TII = TM.getInstrInfo();
  GCModuleInfo &GMI = getAnalysis<GCModuleInfo>();
  GCFunctionInfo &GCFI = GMI.getFunctionInfo(*MF.getFunction());

  for (MachineFunction::iterator MBBI = MF.begin(),
                                 MBBE = MF.end(); MBBI != MBBE; ++MBBI) {
    for (MachineBasicBlock::iterator MII = MBBI->begin(),
                                     MIE = MBBI->end(); MII != MIE;) {
      if (!MII->isGCRegRoot() || !MII->getOperand(0).isReg()) {
        ++MII;
        continue;
      }

      // Trace the register back to its location at the site of the call (either
      // a physical reg or a frame index).
      bool TracingReg = true;
      unsigned TracedReg = MII->getOperand(0).getReg();
      int FrameIndex;

      MachineBasicBlock::iterator PrevII = MII;
      for (--PrevII;; --PrevII) {
        if (PrevII->isGCRegRoot() && PrevII->getOperand(0).isReg())
          break;
        if (PrevII->isCall())
          break;

        int FI;

        // Trace back through register reloads.
        unsigned Reg =
          TM.getInstrInfo()->isLoadFromStackSlotPostFE(&*PrevII, FI);
        if (Reg) {
          // This is a reload. If we're tracing this register, start tracing the
          // frame index instead.
          if (TracingReg && TracedReg == Reg) {
            TracingReg = false;
            FrameIndex = FI;
          }
          continue;
        }

        // Trace back through spills.
        if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&*PrevII, FI))
          continue;

        // Trace back through register-to-register copies.
        if (PrevII->isCopy()) {
          if (TracingReg && TracedReg == PrevII->getOperand(0).getReg())
            TracedReg = PrevII->getOperand(1).getReg();
          continue;
        }

        // Trace back through non-register GC_REG_ROOT instructions.
        if (PrevII->isGCRegRoot() && !PrevII->getOperand(0).isReg())
          continue;

        DEBUG(dbgs() << "Bad instruction: " << *PrevII);
        llvm_unreachable("GC_REG_ROOT found in an unexpected location!");
      }

      // Now we've reached either a call or another GC_REG_ROOT instruction.
      // Move the GC_REG_ROOT instruction we're considering to the right place,
      // and rewrite it if necessary.
      //
      // Also, tell the GCFunctionInfo about the frame index, since this is
      // our only chance -- the frame indices will be deleted by the time
      // GCMachineCodeAnalysis runs.
      ++PrevII;
      unsigned RootIndex = MII->getOperand(1).getImm();
      MachineInstr *NewMI;
      if (TracingReg) {
        MachineInstrBuilder MIB = BuildMI(MF, MII->getDebugLoc(),
                                          TII->get(TargetOpcode::GC_REG_ROOT));
        MIB.addReg(TracedReg).addImm(RootIndex);
        NewMI = MIB;
      } else {
        NewMI = TII->emitFrameIndexGCRegRoot(MF, FrameIndex, RootIndex,
                                             MII->getDebugLoc());
        GCFI.spillRegRoot(RootIndex, FrameIndex);
      }

      MBBI->insert(PrevII, NewMI);

      MachineBasicBlock::iterator NextII = MII;
      ++NextII;
      MII->eraseFromParent();
      MII = NextII;
    }
  }

  return true;
}
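The emitFrameIndexGCRegRoot hook used above is target-provided and not shown
here. A generic sketch, under the assumption that the GC_REG_ROOT pseudo
(part of this patch, not stock LLVM) can carry a frame-index operand directly
in place of the register:

MachineInstr *
TargetInstrInfo::emitFrameIndexGCRegRoot(MachineFunction &MF, int FrameIndex,
                                         unsigned RootIndex,
                                         DebugLoc DL) const {
  // Same pseudo as the register form, but with a frame index operand.
  return BuildMI(MF, DL, get(TargetOpcode::GC_REG_ROOT))
             .addFrameIndex(FrameIndex)
             .addImm(RootIndex);
}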
Example #23
0
bool PatmosSPMark::runOnMachineModule(const Module &M) {

  DEBUG( dbgs() <<
         "[Single-Path] Mark functions reachable from single-path roots\n");

  MMI = &getAnalysis<MachineModuleInfo>();
  assert(MMI);

  Worklist W;

  // initialize the worklist with machine functions that have either
  // sp-root or sp-reachable function attribute
  for(Module::const_iterator F(M.begin()), FE(M.end()); F != FE; ++F) {
    if (F->hasFnAttribute("sp-root") || F->hasFnAttribute("sp-reachable")) {
      // get the machine-level function
      MachineFunction *MF = MMI->getMachineFunction(F);
      assert( MF );
      PatmosMachineFunctionInfo *PMFI =
        MF->getInfo<PatmosMachineFunctionInfo>();
      PMFI->setSinglePath();
      NumSPTotal++; // bump STATISTIC
      W.push_back(MF);
    }
  }

  // process worklist
  while (!W.empty()) {
    MachineFunction *MF = W.front();
    W.pop_front();
    scanAndRewriteCalls(MF, W);
  }

  // clear all cloned machine functions that are not marked as single-path
  // by now
  for(Module::const_iterator F(M.begin()), FE(M.end()); F != FE; ++F) {
    if (F->hasFnAttribute("sp-maybe")) {
      // get the machine-level function
      MachineFunction *MF = MMI->getMachineFunction(F);
      assert( MF );
      PatmosMachineFunctionInfo *PMFI =
        MF->getInfo<PatmosMachineFunctionInfo>();

      if (!PMFI->isSinglePath()) {
        // delete all MBBs
        while (MF->begin() != MF->end()) {
          MF->begin()->eraseFromParent();
        }
        // insert a new single MBB with a single return instruction
        MachineBasicBlock *EmptyMBB = MF->CreateMachineBasicBlock();
        MF->push_back(EmptyMBB);

        DebugLoc DL;
        AddDefaultPred(BuildMI(*EmptyMBB, EmptyMBB->end(), DL,
            TM.getInstrInfo()->get(Patmos::RET)));

        NumSPCleared++; // bump STATISTIC
      }
    }
  }

  return true;
}
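Worklist is not defined in this excerpt; given the push_back/front/pop_front
usage above, a FIFO alias is the natural fit -- a sketch, assuming this
typedef:

#include <deque>
typedef std::deque<MachineFunction *> Worklist;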
Example #24
0
bool VirtRegReduction::runOnMachineFunction(MachineFunction &MF)
{
  bool Changed = false;

#if VRRPROF
  const Function *F = MF.getFunction();
  std::string FN = F->getName().str();
  llog("starting vrr... %s (%d)\n", FN.c_str(), (int)time(NULL));
  llog("starting immRegs finder... (%d)\n", (int)time(NULL));
#endif
  std::auto_ptr<std::unordered_set<unsigned> > immRegsHolder;
  std::unordered_set<unsigned> *immRegs = NULL;
  
  // single-def regs defined by a MoveImm shouldn't coalesce as we may be
  // able to fold them later
  {
    std::unordered_map<unsigned, const MachineInstr *> singleDef;

    MachineFunction::const_iterator I = MF.begin(), E = MF.end();

    // find all registers w/ a single def
    for(; I != E; I++)
    {
      MachineBasicBlock::const_iterator BI = I->begin(), BE = I->end();

      for(; BI != BE; BI++)
      {
        MachineInstr::const_mop_iterator II, IE;
        II = BI->operands_begin();
        IE = BI->operands_end();
        for(; II != IE; II++)
          if(II->isReg() && II->isDef())
          {
            unsigned R = II->getReg();
            std::unordered_map<unsigned, const MachineInstr *>::iterator SI =
                singleDef.find(R);

            if(SI == singleDef.end())
              singleDef[R] = BI; // first seen! insert
            else
              SI->second = NULL; // second seen -- replace w/ NULL
          }
      }
    }

    std::unordered_map<unsigned, const MachineInstr *>::const_iterator SI = singleDef.begin(), SE = singleDef.end();

    for(; SI != SE; SI++)
    {
      if(SI->second && SI->second->getDesc().isMoveImmediate()) // single def imm?
      {
        if(!immRegs)
          immRegsHolder.reset(immRegs = new std::unordered_set<unsigned>);
        immRegs->insert(SI->first); // don't coalesce
      }
    }
  }

#if VRRPROF
  llog("starting tdkRegs finder... (%d)\n", (int)time(NULL));
#endif

  std::auto_ptr<std::unordered_set<unsigned> > tdkRegsHolder;
  std::unordered_set<unsigned> *tdkRegs = NULL;
  
  bool setjmpSafe = !MF.callsSetJmp() && MF.getFunction()->doesNotThrow();

  {
    tdkRegsHolder.reset(tdkRegs = new std::unordered_set<unsigned>);

    std::unordered_map<unsigned, unsigned> trivialDefKills;

    MachineFunction::const_iterator I = MF.begin(), E = MF.end();

    // find all registers defed and killed in the same block w/ no intervening
    // unsafe (due to setjmp) calls + side-effecty operations
    for(; I != E; I++)
    {
      std::unordered_set<unsigned> defs;

      MachineBasicBlock::const_iterator BI = I->begin(), BE = I->end();

      for(; BI != BE; BI++)
      {
        // TODO need to add || BI->getDesc().isInlineAsm() here to help stackification?
        if((!setjmpSafe && BI->getDesc().isCall()) ||
           BI->getDesc().hasUnmodeledSideEffects()) {
          // invalidate on a call instruction if setjmp is present, or on any
          // instr with unmodeled side effects regardless
          defs.clear();
        }

        MachineInstr::const_mop_iterator II, IE;

        // a use of a reg we're not currently tracking makes that reg unsafe
        II = BI->operands_begin();
        IE = BI->operands_end();
        for(; II != IE; II++)
          if(II->isReg() && II->isUse())
          {
            unsigned R = II->getReg();
            std::unordered_set<unsigned>::const_iterator DI = defs.find(R);

            if(DI == defs.end())
              trivialDefKills[R] = 100; // don't use
          }
        // kills of tracked defs are trivial def/kills
        II = BI->operands_begin();
        IE = BI->operands_end();
        for(; II != IE; II++)
          if(II->isReg() && II->isKill())
          {
            unsigned R = II->getReg();
            std::unordered_set<unsigned>::const_iterator DI = defs.find(R);

            if(DI != defs.end())
            {
              defs.erase(DI);
              trivialDefKills[R]++;
            }
            else
              trivialDefKills[R] = 100; // don't use
          }
        // record all defs in this instruction
        II = BI->operands_begin();
        IE = BI->operands_end();
        for(; II != IE; II++)
          if(II->isReg() && II->isDef())
            defs.insert(II->getReg());
      }
    }

    std::unordered_map<unsigned, unsigned>::const_iterator DKI = trivialDefKills.begin(),
        DKE = trivialDefKills.end();

    for(; DKI != DKE; DKI++)
      if(DKI->second == 1)
        tdkRegs->insert(DKI->first);
  }
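  // Illustration (hypothetical opcodes and vreg numbers): in a block like
  //     %vreg5<def> = ADDri %vreg2, 1
  //     STORE %vreg5<kill>, <fi#0>
  // %vreg5 is defed and killed with no intervening call or side-effecting
  // instruction, so trivialDefKills[%vreg5] ends up exactly 1 and the reg
  // lands in tdkRegs.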

#if VRRPROF
  llog("starting conflict graph construction... (%d)\n", (int)time(NULL));
#endif

  std::unordered_set<unsigned>::const_iterator tdkE = tdkRegs->end();

  std::unordered_set<unsigned> *okRegs = NULL;

  if(!setjmpSafe)
    okRegs = tdkRegs;

  MachineRegisterInfo *RI = &(MF.getRegInfo());
  // will eventually hold a virt register coloring for this function
  ConflictGraph::Coloring coloring;

  {
    ConflictGraph cg;
    LiveIntervals &LIS = getAnalysis<LiveIntervals>();
    LiveIntervals::const_iterator I = LIS.begin(), E = LIS.end();

    // check every possible LiveInterval, LiveInterval pair of the same
    // register class for overlap and add overlaps to the conflict graph
    // also, treat trivially def-kill-ed regs and not trivially def-kill-ed
    // regs as conflicting so they end up using different VRs -- this makes
    // stackification easier later in the toolchain
    for(; I != E; I++)
    {
      unsigned R = I->first;

      if(TargetRegisterInfo::isPhysicalRegister(R))
        continue;
      if(okRegs && okRegs->find(R) == okRegs->end())
        continue;
      // leave singly-defined MoveImm regs for later coalescing
      if(immRegs && immRegs->find(R) != immRegs->end())
        continue;

//      const TargetRegisterClass *RC = RI->getRegClass(R);
      const LiveInterval *LI = I->second;

      if(LI->empty())
        continue;

      cg.addVertex(R);

      bool notTDK = tdkRegs->find(R) == tdkE;
      LiveIntervals::const_iterator I1 = I;

      I1++;

      for(; I1 != E; I1++)
      {
        unsigned R1 = I1->first;

        if(TargetRegisterInfo::isPhysicalRegister(R1))
          continue;
        if(okRegs && okRegs->find(R1) == okRegs->end())
          continue;
        // leave singly-defined MoveImm regs for later coalescing
        if(immRegs && immRegs->find(R1) != immRegs->end())
          continue;

/* Don't bother checking RC -- even though it sounds like an opt, it doesn't speed us up in practice
        const TargetRegisterClass *RC1 = RI->getRegClass(R1);

        if(RC != RC1)
          continue; // different reg class... won't conflict
*/

        const LiveInterval *LI1 = I1->second;

        // conflict if the intervals overlap, or if exactly one of the pair is
        // trivially def/kill-ed (keeps TDK and non-TDK regs in different VRs)
        if(LI->overlaps(*LI1) || notTDK != (tdkRegs->find(R1) == tdkE))
          cg.addEdge(R, R1);
      }
    }

#if VRRPROF
  llog("starting coloring... (%d)\n", (int)time(NULL));
#endif

    cg.color(&coloring);

#if VRRPROF
  llog("starting vreg=>vreg construction... (%d)\n", (int)time(NULL));
#endif

    typedef std::unordered_map<unsigned, unsigned> VRegMap;
    VRegMap Regs;

    // build up map of vreg=>vreg
    {
      std::unordered_map<const TargetRegisterClass *,
                         std::unordered_map<unsigned, unsigned> > RCColor2VReg;

      ConflictGraph::Coloring::const_iterator I = coloring.begin(),
                                              E = coloring.end();

      for(; I != E; I++)
      {
        unsigned R = I->first;
        unsigned Color = I->second;
        const TargetRegisterClass *RC = RI->getRegClass(R);
        std::unordered_map<unsigned, unsigned> &Color2VReg = RCColor2VReg[RC];

        VRegMap::const_iterator CI = Color2VReg.find(Color);

        if(CI != Color2VReg.end())
          Regs[R] = CI->second; // seen this color; map it
        else
          Regs[R] = Color2VReg[Color] = R; // first sighting of color; bind to this reg
      }
    }

#if VRRPROF
  llog("starting remap... (%d)\n", (int)time(NULL));
#endif

    // remap regs
    {
      VRegMap::const_iterator I = Regs.begin(), E = Regs.end();

      for(; I != E; I++)
        if(I->first != I->second)
        {
          RI->replaceRegWith(I->first, I->second);
          Changed = true;
        }
    }
  }

#if VRRPROF
  llog("done... (%d)\n", (int)time(NULL));
#endif

  return Changed;
}
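ConflictGraph itself is not part of this excerpt. A minimal sketch of an
interface satisfying the calls above (addVertex, addEdge, color), using a
plain greedy coloring -- illustrative only, not the original implementation:

class ConflictGraph {
  std::unordered_map<unsigned, std::unordered_set<unsigned> > Adj;

public:
  typedef std::unordered_map<unsigned, unsigned> Coloring;

  void addVertex(unsigned V) { Adj[V]; }

  void addEdge(unsigned A, unsigned B) {
    Adj[A].insert(B);
    Adj[B].insert(A);
  }

  // Greedy coloring: give each vertex the smallest color not already used
  // by one of its colored neighbors. Vertices sharing a color can then be
  // merged into a single virtual register.
  void color(Coloring *Out) const {
    typedef std::unordered_map<unsigned,
                               std::unordered_set<unsigned> > AdjMap;
    for (AdjMap::const_iterator I = Adj.begin(), E = Adj.end(); I != E; ++I) {
      std::unordered_set<unsigned> Used;
      const std::unordered_set<unsigned> &Neighbors = I->second;
      for (std::unordered_set<unsigned>::const_iterator N = Neighbors.begin(),
             NE = Neighbors.end(); N != NE; ++N) {
        Coloring::const_iterator C = Out->find(*N);
        if (C != Out->end())
          Used.insert(C->second);
      }
      unsigned Color = 0;
      while (Used.count(Color))
        ++Color;
      (*Out)[I->first] = Color;
    }
  }
};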
Example #25
0
/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
/// with physical registers. Use the register scavenger to find an
/// appropriate register to use.
///
/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
/// iterate over the vreg use list, which at this point only contains machine
/// operands for which eliminateFrameIndex needs a new scratch reg.
void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
  // Run through the instructions and find any virtual registers.
  for (MachineFunction::iterator BB = Fn.begin(),
       E = Fn.end(); BB != E; ++BB) {
    RS->enterBasicBlock(BB);

    int SPAdj = 0;

    // The instruction stream may change in the loop, so check BB->end()
    // directly.
    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
      MachineInstr *MI = I;
      MachineBasicBlock::iterator J = llvm::next(I);
      MachineBasicBlock::iterator P = I == BB->begin() ?
        MachineBasicBlock::iterator(NULL) : llvm::prior(I);

      // RS should process this instruction before we might scavenge at this
      // location. This is because we might be replacing a virtual register
      // defined by this instruction, and if so, registers killed by this
      // instruction are available, and defined registers are not.
      RS->forward(I);

      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
        if (MI->getOperand(i).isReg()) {
          MachineOperand &MO = MI->getOperand(i);
          unsigned Reg = MO.getReg();
          if (Reg == 0)
            continue;
          if (!TargetRegisterInfo::isVirtualRegister(Reg))
            continue;

          // When we first encounter a new virtual register, it
          // must be a definition.
          assert(MI->getOperand(i).isDef() &&
                 "frame index virtual missing def!");
          // Scavenge a new scratch register
          const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
          unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);

          ++NumScavengedRegs;

          // Replace this reference to the virtual register with the
          // scratch register.
          assert (ScratchReg && "Missing scratch register!");
          Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);

          // Because this instruction was processed by the RS before this
          // register was allocated, make sure that the RS now records the
          // register as being used.
          RS->setUsed(ScratchReg);
        }
      }

      // If the scavenger needed to use one of its spill slots, the
      // spill code will have been inserted in between I and J. This is a
      // problem because we need the spill code before I: Move I to just
      // prior to J.
      if (I != llvm::prior(J)) {
        BB->splice(J, BB, I);

        // Before we move I, we need to prepare the RS to visit I again.
        // Specifically, RS will assert if it sees uses of registers that
        // it believes are undefined. Because we have already processed
        // register kills in I, when it visits I again, it will believe that
        // those registers are undefined. To avoid this situation, unprocess
        // the instruction I.
        assert(RS->getCurrentPosition() == I &&
          "The register scavenger has an unexpected position");
        I = P;
        RS->unprocess(P);

        // RS->skipTo(I == BB->begin() ? NULL : llvm::prior(I));
      } else
        ++I;
    }
  }
}
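A rough sketch of the direction the FIXME above suggests -- iterating the
remaining virtual registers directly rather than scanning every instruction.
It assumes the only vregs left at this point are frame-index scratch
registers; correctly positioning the scavenger at each def is the part this
sketch skips:

  MachineRegisterInfo &MRI = Fn.getRegInfo();
  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
    if (MRI.reg_empty(Reg))
      continue;
    // ...walk RS to the defining instruction, scavenge a scratch register,
    // and replaceRegWith(Reg, ScratchReg) as in the loop above...
  }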
Example #26
0
/// runOnMachineFunction - This emits the frame section, the autos section, and
/// the assembly for each instruction. It also takes care of the function-begin
/// debug directive and the file-begin debug directive (if required) for the
/// function.
///
bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // This calls the base class function required to be called at beginning
  // of runOnMachineFunction.
  SetupMachineFunction(MF);

  // Put the color information from function to its auto section.
  const Function *F = MF.getFunction();
  ColorAutoSection(F);

  // Emit the function frame (args and temps).
  EmitFunctionFrame(MF);

  DbgInfo.BeginFunction(MF);

  // Now emit the instructions of function in its code section.
  const MCSection *fCodeSection = 
    getObjFileLowering().SectionForCode(CurrentFnSym->getName(), 
                                        PAN::isISR(F->getSection()));

  // Start the Code Section.
  O << "\n";
  OutStreamer.SwitchSection(fCodeSection);

  // Emit the frame address of the function at the beginning of code.
  O << "\tretlw  low(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n";
  O << "\tretlw  high(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n";

  // Emit function start label.
  O << *CurrentFnSym << ":\n";

  DebugLoc CurDL;
  O << "\n"; 
  // Print out code for the function.
  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {

    // Print a label for the basic block.
    if (I != MF.begin()) {
      EmitBasicBlockStart(I);
    }
    
    // Print a basic block.
    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
         II != E; ++II) {

      // Emit the line directive if source line changed.
      const DebugLoc DL = II->getDebugLoc();
      if (!DL.isUnknown() && DL != CurDL) {
        DbgInfo.ChangeDebugLoc(MF, DL);
        CurDL = DL;
      }
        
      // Print the assembly for the instruction.
      EmitInstruction(II);
    }
  }
  
  // Emit function end debug directives.
  DbgInfo.EndFunction(MF);

  return false;  // we didn't modify anything.
}
Example #27
0
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (!isSafeToFold(MI.getOpcode()))
        continue;

      unsigned OpSize = TII->getOpSize(MI, 1);
      MachineOperand &OpToFold = MI.getOperand(1);
      bool FoldingImm = OpToFold.isImm();

      // FIXME: We could also be folding things like FrameIndexes and
      // TargetIndexes.
      if (!FoldingImm && !OpToFold.isReg())
        continue;

      // Folding immediates with more than one use will increase program size.
      // FIXME: This will also reduce register usage, which may be better
      // in some cases.  A better heuristic is needed.
      if (FoldingImm && !TII->isInlineConstant(OpToFold, OpSize) &&
          !MRI.hasOneUse(MI.getOperand(0).getReg()))
        continue;

      if (OpToFold.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
        continue;

      // Prevent folding operands backwards in the function. For example,
      // the COPY opcode must not be replaced by 1 in this example:
      //
      //    %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3
      //    ...
      //    %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use>
      MachineOperand &Dst = MI.getOperand(0);
      if (Dst.isReg() &&
          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
        continue;

      // We need to mutate the operands of new mov instructions to add implicit
      // uses of EXEC, but adding them invalidates the use_iterator, so defer
      // this.
      SmallVector<MachineInstr *, 4> CopiesToReplace;

      std::vector<FoldCandidate> FoldList;
      for (MachineRegisterInfo::use_iterator
           Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
           Use != E; ++Use) {

        MachineInstr *UseMI = Use->getParent();

        foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
                    CopiesToReplace, TII, TRI, MRI);
      }

      // Make sure we add EXEC uses to any new v_mov instructions created.
      for (MachineInstr *Copy : CopiesToReplace)
        Copy->addImplicitDefUseOperands(MF);

      for (FoldCandidate &Fold : FoldList) {
        if (updateOperand(Fold, TRI)) {
          // Clear kill flags.
          if (!Fold.isImm()) {
            assert(Fold.OpToFold && Fold.OpToFold->isReg());
            // FIXME: Probably shouldn't bother trying to fold if not an
            // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
            // copies.
            MRI.clearKillFlags(Fold.OpToFold->getReg());
          }
          DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
                Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
        }
      }
    }
  }
  return false;
}
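FoldCandidate is defined elsewhere in this pass; its assumed shape, inferred
from the uses above (Fold.isImm(), Fold.OpToFold, Fold.UseOpNo, Fold.UseMI)
-- a sketch, not the verbatim definition:

struct FoldCandidate {
  MachineInstr *UseMI;      // the instruction being folded into
  unsigned UseOpNo;         // which operand of UseMI gets replaced
  MachineOperand *OpToFold; // register operand to fold, null for immediates
  uint64_t ImmToFold;       // immediate value when folding an immediate

  bool isImm() const { return OpToFold == nullptr; }
};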
Example #28
0
bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
  const PPCInstrInfo *TII =
                static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo());
  // Give the blocks of the function a dense, in-order, numbering.
  Fn.RenumberBlocks();
  BlockSizes.resize(Fn.getNumBlockIDs());

  // Measure each MBB and compute a size for the entire function.
  unsigned FuncSize = 0;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock *MBB = MFI;

    unsigned BlockSize = 0;
    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
         MBBI != EE; ++MBBI)
      BlockSize += TII->GetInstSizeInBytes(MBBI);
    
    BlockSizes[MBB->getNumber()] = BlockSize;
    FuncSize += BlockSize;
  }
  
  // If the entire function is smaller than the displacement of a branch field,
  // we know we don't need to shrink any branches in this function.  This is a
  // common case.
  if (FuncSize < (1 << 15)) {
    BlockSizes.clear();
    return false;
  }
  
  // For each conditional branch, if the offset to its destination is larger
  // than the offset field allows, transform it into a long branch sequence
  // like this:
  //   short branch:
  //     bCC MBB
  //   long branch:
  //     b!CC $PC+8
  //     b MBB
  //
  bool MadeChange = true;
  bool EverMadeChange = false;
  while (MadeChange) {
    // Iteratively expand branches until we reach a fixed point.
    MadeChange = false;
  
    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
         ++MFI) {
      MachineBasicBlock &MBB = *MFI;
      unsigned MBBStartOffset = 0;
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E; ++I) {
        MachineBasicBlock *Dest = nullptr;
        if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
          Dest = I->getOperand(2).getMBB();
        else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
                 !I->getOperand(1).isImm())
          Dest = I->getOperand(1).getMBB();
        else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
                  I->getOpcode() == PPC::BDZ8  || I->getOpcode() == PPC::BDZ) &&
                 !I->getOperand(0).isImm())
          Dest = I->getOperand(0).getMBB();

        if (!Dest) {
          MBBStartOffset += TII->GetInstSizeInBytes(I);
          continue;
        }
        
        // Determine the offset from the current branch to the destination
        // block.
        int BranchSize;
        if (Dest->getNumber() <= MBB.getNumber()) {
          // If this is a backwards branch, the delta is the offset from the
          // start of this block to this branch, plus the sizes of all blocks
          // from this block to the dest.
          BranchSize = MBBStartOffset;
          
          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
            BranchSize += BlockSizes[i];
        } else {
          // Otherwise, add the size of the blocks between this block and the
          // dest to the number of bytes left in this block.
          BranchSize = -MBBStartOffset;

          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
            BranchSize += BlockSizes[i];
        }

        // If this branch is in range, ignore it.
        if (isInt<16>(BranchSize)) {
          MBBStartOffset += 4;
          continue;
        }

        // Otherwise, we have to expand it to a long branch.
        MachineInstr *OldBranch = I;
        DebugLoc dl = OldBranch->getDebugLoc();
 
        if (I->getOpcode() == PPC::BCC) {
          // The BCC operands are:
          // 0. PPC branch predicate
          // 1. CR register
          // 2. Target MBB
          PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
          unsigned CRReg = I->getOperand(1).getReg();
       
          // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
          BuildMI(MBB, I, dl, TII->get(PPC::BCC))
            .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
        } else if (I->getOpcode() == PPC::BC) {
          unsigned CRBit = I->getOperand(0).getReg();
          BuildMI(MBB, I, dl, TII->get(PPC::BCn)).addReg(CRBit).addImm(2);
        } else if (I->getOpcode() == PPC::BCn) {
          unsigned CRBit = I->getOperand(0).getReg();
          BuildMI(MBB, I, dl, TII->get(PPC::BC)).addReg(CRBit).addImm(2);
        } else if (I->getOpcode() == PPC::BDNZ) {
          BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
        } else if (I->getOpcode() == PPC::BDNZ8) {
          BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
        } else if (I->getOpcode() == PPC::BDZ) {
          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
        } else if (I->getOpcode() == PPC::BDZ8) {
          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
        } else {
           llvm_unreachable("Unhandled branch type!");
        }
        
        // Uncond branch to the real destination.
        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);

        // Remove the old branch from the function.
        OldBranch->eraseFromParent();
        
        // The long branch sequence is 8 bytes where the old branch was 4, so
        // increase the size of the block by 4 and remember to iterate.
        BlockSizes[MBB.getNumber()] += 4;
        MBBStartOffset += 8;
        ++NumExpanded;
        MadeChange = true;
      }
    }
    EverMadeChange |= MadeChange;
  }
  
  BlockSizes.clear();
  return true;
}
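To make the offset arithmetic concrete, a worked example (block numbers and
sizes invented for illustration):

// Suppose BlockSizes = { BB#0: 40, BB#1: 120, ... } and a conditional
// branch sits 16 bytes into BB#2, targeting BB#0.
//
// Backward branch (Dest->getNumber() <= MBB.getNumber()):
//   BranchSize = MBBStartOffset            // 16
//              + BlockSizes[0]             // +40   (the dest block itself)
//              + BlockSizes[1]             // +120
//              = 176 bytes -- isInt<16>(176) holds, so the branch is in range.
//
// Had the intervening blocks summed to >= 32768 bytes, the branch would be
// rewritten as the two-instruction long form, whose unconditional branch has
// a much larger displacement field:
//   b!CC $PC+8   ; skip on the opposite condition
//   b    BB#0    ; unconditional branch to the real destination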
Example #29
0
/// runOnMachineFunction - This uses the printInstruction()
/// method to print assembly for each instruction.
///
bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  this->MF = &MF;

  // This calls the base class function required to be called at beginning
  // of runOnMachineFunction.
  SetupMachineFunction(MF);

  // Get the mangled name.
  const Function *F = MF.getFunction();
  CurrentFnName = Mang->getValueName(F);

  // Emit the function variables.
  EmitFunctionFrame(MF);

  // Emit function begin debug directives
  DbgInfo.EmitFunctBeginDI(F);

  EmitAutos(CurrentFnName);
  // Keep the std::string alive; taking c_str() of the temporary returned by
  // getCodeSectionName would leave a dangling pointer.
  const std::string codeSection = PAN::getCodeSectionName(CurrentFnName);

  const Section *fCodeSection = TAI->getNamedSection(codeSection.c_str(),
                                                     SectionFlags::Code);
  O << "\n";
  // Start the Code Section.
  SwitchToSection(fCodeSection);

  // Emit the frame address of the function at the beginning of code.
  O << "\tretlw  low(" << PAN::getFrameLabel(CurrentFnName) << ")\n";
  O << "\tretlw  high(" << PAN::getFrameLabel(CurrentFnName) << ")\n";

  // Emit function start label.
  O << CurrentFnName << ":\n";

  // For emitting line directives, we need to keep track of the current
  // source line. When it changes then only emit the line directive.
  unsigned CurLine = 0;
  O << "\n"; 
  // Print out code for the function.
  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
       I != E; ++I) {
    // Print a label for the basic block.
    if (I != MF.begin()) {
      printBasicBlockLabel(I, true);
      O << '\n';
    }
    
    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
         II != E; ++II) {
      // Emit the line directive if source line changed.
      const DebugLoc DL = II->getDebugLoc();
      if (!DL.isUnknown()) {
        unsigned line = MF.getDebugLocTuple(DL).Line;
        if (line != CurLine) {
          O << "\t.line " << line << "\n";
          CurLine = line;
        }
      }
        
      // Print the assembly for the instruction.
      printMachineInstruction(II);
    }
  }
  
  // Emit function end debug directives.
  DbgInfo.EmitFunctEndDI(F, CurLine);
  return false;  // we didn't modify anything.
}
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
        continue;
      }

      if (MI.getOpcode() == AMDGPU::V_XOR_I1) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        MI.setDesc(TII->get(AMDGPU::V_XOR_B32_e32));
        continue;
      }

      if (MI.getOpcode() != AMDGPU::COPY ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
        continue;

      const TargetRegisterClass *DstRC =
          MRI.getRegClass(MI.getOperand(0).getReg());
      const TargetRegisterClass *SrcRC =
          MRI.getRegClass(MI.getOperand(1).getReg());

      if (DstRC == &AMDGPU::VReg_1RegClass &&
          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
        I1Defs.push_back(MI.getOperand(0).getReg());
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
                .addOperand(MI.getOperand(0))
                .addImm(0)
                .addImm(-1)
                .addOperand(MI.getOperand(1))
                .addImm(0)
                .addImm(0)
                .addImm(0)
                .addImm(0);
        MI.eraseFromParent();
      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
                 SrcRC == &AMDGPU::VReg_1RegClass) {
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
                .addOperand(MI.getOperand(0))
                .addOperand(MI.getOperand(1))
                .addImm(0);
        MI.eraseFromParent();
      }
    }
  }

  for (unsigned Reg : I1Defs)
    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);

  return false;
}
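For reference, the effect of the two rewrites above on an i1 copy in each
direction (pseudo-MIR, vreg numbers illustrative):

// VReg_1 <- SGPR_64 condition:
//   before: %vreg1<def> = COPY %vreg0
//   after:  %vreg1<def> = V_CNDMASK_B32_e64 0, -1, %vreg0
//           (materialize the scalar condition as per-lane 0/-1 values)
//
// SGPR_64 <- VReg_1:
//   before: %vreg2<def> = COPY %vreg1
//   after:  %vreg2<def> = V_CMP_NE_I32_e64 %vreg1, 0
//           (recover a condition mask by comparing the 0/-1 values to 0)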