Example #1
0
// Try to fuse comparison instruction Compare into a later branch.
// Return true on success and if Compare is therefore redundant.
bool SystemZElimCompare::
fuseCompareAndBranch(MachineInstr *Compare,
                     SmallVectorImpl<MachineInstr *> &CCUsers) {
  // See whether we have a comparison that can be fused.
  unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
                                                  Compare);
  if (!FusedOpcode)
    return false;

  // See whether we have a single branch with which to fuse.
  if (CCUsers.size() != 1)
    return false;
  MachineInstr *Branch = CCUsers[0];
  if (Branch->getOpcode() != SystemZ::BRC)
    return false;

  // Make sure that the operands are available at the branch.
  unsigned SrcReg = Compare->getOperand(0).getReg();
  unsigned SrcReg2 = (Compare->getOperand(1).isReg() ?
                      Compare->getOperand(1).getReg() : 0);
  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
  for (++MBBI; MBBI != MBBE; ++MBBI)
    if (MBBI->modifiesRegister(SrcReg, TRI) ||
        (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
      return false;

  // Read the branch mask and target.
  MachineOperand CCMask(MBBI->getOperand(1));
  MachineOperand Target(MBBI->getOperand(2));
  assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
         "Invalid condition-code mask for integer comparison");

  // Clear out all current operands.
  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
  assert(CCUse >= 0 && "BRC must use CC");
  Branch->RemoveOperand(CCUse);
  Branch->RemoveOperand(2);
  Branch->RemoveOperand(1);
  Branch->RemoveOperand(0);

  // Rebuild Branch as a fused compare and branch.
  Branch->setDesc(TII->get(FusedOpcode));
  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
    .addOperand(Compare->getOperand(0))
    .addOperand(Compare->getOperand(1))
    .addOperand(CCMask)
    .addOperand(Target)
    .addReg(SystemZ::CC, RegState::ImplicitDefine);

  // Clear any intervening kills of SrcReg and SrcReg2.
  MBBI = Compare;
  for (++MBBI; MBBI != MBBE; ++MBBI) {
    MBBI->clearRegisterKills(SrcReg, TRI);
    if (SrcReg2)
      MBBI->clearRegisterKills(SrcReg2, TRI);
  }
  FusedComparisons += 1;
  return true;
}
Example #2
0
// We have identified this II could be feeder to NVJ,
// verify that it can be.
static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
                                      const TargetRegisterInfo *TRI,
                                      MachineBasicBlock::iterator II,
                                      MachineBasicBlock::iterator end,
                                      MachineBasicBlock::iterator skip,
                                      MachineFunction &MF) {

  // Predicated instruction can not be feeder to NVJ.
  if (QII->isPredicated(*II))
    return false;

  // Bail out if feederReg is a paired register (double regs in
  // our case). One would think that we can check to see if a given
  // register cmpReg1 or cmpReg2 is a sub register of feederReg
  // using -- if (QRI->isSubRegister(feederReg, cmpReg1) logic
  // before the callsite of this function
  // But we can not as it comes in the following fashion.
  //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
  //    %P0<def> = CMPEQri %R0<kill>, 0
  // Hence, we need to check if it's a KILL instruction.
  if (II->getOpcode() == TargetOpcode::KILL)
    return false;


  // Make sure there there is no 'def' or 'use' of any of the uses of
  // feeder insn between it's definition, this MI and jump, jmpInst
  // skipping compare, cmpInst.
  // Here's the example.
  //    r21=memub(r22+r24<<#0)
  //    p0 = cmp.eq(r21, #0)
  //    r4=memub(r3+r21<<#0)
  //    if (p0.new) jump:t .LBB29_45
  // Without this check, it will be converted into
  //    r4=memub(r3+r21<<#0)
  //    r21=memub(r22+r24<<#0)
  //    p0 = cmp.eq(r21, #0)
  //    if (p0.new) jump:t .LBB29_45
  // and result WAR hazards if converted to New Value Jump.

  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
    if (II->getOperand(i).isReg() &&
        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
      MachineBasicBlock::iterator localII = II;
      ++localII;
      unsigned Reg = II->getOperand(i).getReg();
      for (MachineBasicBlock::iterator localBegin = localII;
                        localBegin != end; ++localBegin) {
        if (localBegin == skip ) continue;
        // Check for Subregisters too.
        if (localBegin->modifiesRegister(Reg, TRI) ||
            localBegin->readsRegister(Reg, TRI))
          return false;
      }
    }
  }
  return true;
}
Example #3
0
static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
                                     const TargetRegisterInfo *TRI,
                                     MachineBasicBlock::iterator II,
                                     unsigned pReg,
                                     bool secondReg,
                                     bool optLocation,
                                     MachineBasicBlock::iterator end,
                                     MachineFunction &MF) {

  MachineInstr &MI = *II;

  // If the second operand of the compare is an imm, make sure it's in the
  // range specified by the arch.
  if (!secondReg) {
    int64_t v = MI.getOperand(2).getImm();

    if (!(isUInt<5>(v) || ((MI.getOpcode() == Hexagon::C2_cmpeqi ||
                            MI.getOpcode() == Hexagon::C2_cmpgti) &&
                           (v == -1))))
      return false;
  }

  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
  cmpReg1 = MI.getOperand(1).getReg();

  if (secondReg) {
    cmpOp2 = MI.getOperand(2).getReg();

    // Make sure that that second register is not from COPY
    // At machine code level, we don't need this, but if we decide
    // to move new value jump prior to RA, we would be needing this.
    MachineRegisterInfo &MRI = MF.getRegInfo();
    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
      MachineInstr *def = MRI.getVRegDef(cmpOp2);
      if (def->getOpcode() == TargetOpcode::COPY)
        return false;
    }
  }

  // Walk the instructions after the compare (predicate def) to the jump,
  // and satisfy the following conditions.
  ++II ;
  for (MachineBasicBlock::iterator localII = II; localII != end;
       ++localII) {

    // Check 1.
    // If "common" checks fail, bail out.
    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
      return false;

    // Check 2.
    // If there is a def or use of predicate (result of compare), bail out.
    if (localII->modifiesRegister(pReg, TRI) ||
        localII->readsRegister(pReg, TRI))
      return false;

    // Check 3.
    // If there is a def of any of the use of the compare (operands of compare),
    // bail out.
    // Eg.
    //    p0 = cmp.eq(r2, r0)
    //    r2 = r4
    //    if (p0.new) jump:t .LBB28_3
    if (localII->modifiesRegister(cmpReg1, TRI) ||
        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
      return false;
  }
  return true;
}
Example #4
0
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);

  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  ReturnsVoid = MFI->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  SmallVector<MachineInstr *, 4> RemoveMI;
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {

      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
        //    complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (VCCZCorrupt && readsVCCZ(*I)) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM.  vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit.  Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
            .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks
      if ((I->getOpcode() == AMDGPU::S_BARRIER &&
               !ST->hasAutoWaitcntBeforeBarrier()) ||
           I->getOpcode() == AMDGPU::S_SENDMSG ||
           I->getOpcode() == AMDGPU::S_SENDMSGHALT)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }

    // Wait for everything at the end of the MBB. If there is only one
    // successor, we can defer this until the uses there.
    if (!hasTrivialSuccessor(MBB))
      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {

        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
          Changes = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to to the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

    Changes = true;
  }

  return Changes;
}
Example #5
0
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Optimize sequences emitted for control flow lowering. They are originally
  // emitted as the separate operations because spill code may need to be
  // inserted for the saved copy of exec.
  //
  //     x = copy exec
  //     z = s_<op>_b64 x, y
  //     exec = copy z
  // =>
  //     x = s_<op>_saveexec_b64 y
  //

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    unsigned CopyToExec = isCopyToExec(*I);
    if (CopyToExec == AMDGPU::NoRegister)
      continue;

    // Scan backwards to find the def.
    auto CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
        PrepareExecInst->getOperand(0).setIsRenamable(false);

        DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator J
           = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
        DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          DEBUG(dbgs() << "Multiple instructions modify "
                << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        DEBUG(dbgs() << "Found second use of save inst candidate: "
              << *J << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
      .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
                                    AMDGPU::NoSubRegister, *TRI,
                                    /*ClearIsRenamable=*/true);
    }
  }

  return true;

}
Example #6
0
// Try to fuse comparison instruction Compare into a later branch.
// Return true on success and if Compare is therefore redundant.
bool SystemZElimCompare::fuseCompareOperations(
    MachineInstr &Compare, SmallVectorImpl<MachineInstr *> &CCUsers) {
  // See whether we have a single branch with which to fuse.
  if (CCUsers.size() != 1)
    return false;
  MachineInstr *Branch = CCUsers[0];
  SystemZII::FusedCompareType Type;
  switch (Branch->getOpcode()) {
  case SystemZ::BRC:
    Type = SystemZII::CompareAndBranch;
    break;
  case SystemZ::CondReturn:
    Type = SystemZII::CompareAndReturn;
    break;
  case SystemZ::CallBCR:
    Type = SystemZII::CompareAndSibcall;
    break;
  case SystemZ::CondTrap:
    Type = SystemZII::CompareAndTrap;
    break;
  default:
    return false;
  }

  // See whether we have a comparison that can be fused.
  unsigned FusedOpcode =
      TII->getFusedCompare(Compare.getOpcode(), Type, &Compare);
  if (!FusedOpcode)
    return false;

  // Make sure that the operands are available at the branch.
  // SrcReg2 is the register if the source operand is a register,
  // 0 if the source operand is immediate, and the base register
  // if the source operand is memory (index is not supported).
  unsigned SrcReg = Compare.getOperand(0).getReg();
  unsigned SrcReg2 =
      Compare.getOperand(1).isReg() ? Compare.getOperand(1).getReg() : 0;
  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
  for (++MBBI; MBBI != MBBE; ++MBBI)
    if (MBBI->modifiesRegister(SrcReg, TRI) ||
        (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
      return false;

  // Read the branch mask, target (if applicable), regmask (if applicable).
  MachineOperand CCMask(MBBI->getOperand(1));
  assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
         "Invalid condition-code mask for integer comparison");
  // This is only valid for CompareAndBranch.
  MachineOperand Target(MBBI->getOperand(
    Type == SystemZII::CompareAndBranch ? 2 : 0));
  const uint32_t *RegMask;
  if (Type == SystemZII::CompareAndSibcall)
    RegMask = MBBI->getOperand(2).getRegMask();

  // Clear out all current operands.
  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
  assert(CCUse >= 0 && "BRC/BCR must use CC");
  Branch->RemoveOperand(CCUse);
  // Remove target (branch) or regmask (sibcall).
  if (Type == SystemZII::CompareAndBranch ||
      Type == SystemZII::CompareAndSibcall)
    Branch->RemoveOperand(2);
  Branch->RemoveOperand(1);
  Branch->RemoveOperand(0);

  // Rebuild Branch as a fused compare and branch.
  // SrcNOps is the number of MI operands of the compare instruction
  // that we need to copy over.
  unsigned SrcNOps = 2;
  if (FusedOpcode == SystemZ::CLT || FusedOpcode == SystemZ::CLGT)
    SrcNOps = 3;
  Branch->setDesc(TII->get(FusedOpcode));
  MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
  for (unsigned I = 0; I < SrcNOps; I++)
    MIB.add(Compare.getOperand(I));
  MIB.add(CCMask);

  if (Type == SystemZII::CompareAndBranch) {
    // Only conditional branches define CC, as they may be converted back
    // to a non-fused branch because of a long displacement.  Conditional
    // returns don't have that problem.
    MIB.add(Target).addReg(SystemZ::CC,
                           RegState::ImplicitDefine | RegState::Dead);
  }

  if (Type == SystemZII::CompareAndSibcall)
    MIB.addRegMask(RegMask);

  // Clear any intervening kills of SrcReg and SrcReg2.
  MBBI = Compare;
  for (++MBBI; MBBI != MBBE; ++MBBI) {
    MBBI->clearRegisterKills(SrcReg, TRI);
    if (SrcReg2)
      MBBI->clearRegisterKills(SrcReg2, TRI);
  }
  FusedComparisons += 1;
  return true;
}
/// For a given conditional copy, predicate the definition of the source of
/// the copy under the given condition (using the same predicate register as
/// the copy).
bool HexagonExpandCondsets::predicate(MachineInstr &TfrI, bool Cond,
                                      std::set<unsigned> &UpdRegs) {
  // TfrI - A2_tfr[tf] Instruction (not A2_tfrsi).
  unsigned Opc = TfrI.getOpcode();
  (void)Opc;
  assert(Opc == Hexagon::A2_tfrt || Opc == Hexagon::A2_tfrf);
  DEBUG(dbgs() << "\nattempt to predicate if-" << (Cond ? "true" : "false")
               << ": " << TfrI);

  MachineOperand &MD = TfrI.getOperand(0);
  MachineOperand &MP = TfrI.getOperand(1);
  MachineOperand &MS = TfrI.getOperand(2);
  // The source operand should be a <kill>. This is not strictly necessary,
  // but it makes things a lot simpler. Otherwise, we would need to rename
  // some registers, which would complicate the transformation considerably.
  if (!MS.isKill())
    return false;
  // Avoid predicating instructions that define a subregister if subregister
  // liveness tracking is not enabled.
  if (MD.getSubReg() && !MRI->shouldTrackSubRegLiveness(MD.getReg()))
    return false;

  RegisterRef RT(MS);
  unsigned PredR = MP.getReg();
  MachineInstr *DefI = getReachingDefForPred(RT, TfrI, PredR, Cond);
  if (!DefI || !isPredicable(DefI))
    return false;

  DEBUG(dbgs() << "Source def: " << *DefI);

  // Collect the information about registers defined and used between the
  // DefI and the TfrI.
  // Map: reg -> bitmask of subregs
  ReferenceMap Uses, Defs;
  MachineBasicBlock::iterator DefIt = DefI, TfrIt = TfrI;

  // Check if the predicate register is valid between DefI and TfrI.
  // If it is, we can then ignore instructions predicated on the negated
  // conditions when collecting def and use information.
  bool PredValid = true;
  for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
    if (!I->modifiesRegister(PredR, nullptr))
      continue;
    PredValid = false;
    break;
  }

  for (MachineBasicBlock::iterator I = std::next(DefIt); I != TfrIt; ++I) {
    MachineInstr *MI = &*I;
    // If this instruction is predicated on the same register, it could
    // potentially be ignored.
    // By default assume that the instruction executes on the same condition
    // as TfrI (Exec_Then), and also on the opposite one (Exec_Else).
    unsigned Exec = Exec_Then | Exec_Else;
    if (PredValid && HII->isPredicated(*MI) && MI->readsRegister(PredR))
      Exec = (Cond == HII->isPredicatedTrue(*MI)) ? Exec_Then : Exec_Else;

    for (auto &Op : MI->operands()) {
      if (!Op.isReg())
        continue;
      // We don't want to deal with physical registers. The reason is that
      // they can be aliased with other physical registers. Aliased virtual
      // registers must share the same register number, and can only differ
      // in the subregisters, which we are keeping track of. Physical
      // registers ters no longer have subregisters---their super- and
      // subregisters are other physical registers, and we are not checking
      // that.
      RegisterRef RR = Op;
      if (!TargetRegisterInfo::isVirtualRegister(RR.Reg))
        return false;

      ReferenceMap &Map = Op.isDef() ? Defs : Uses;
      if (Op.isDef() && Op.isUndef()) {
        assert(RR.Sub && "Expecting a subregister on <def,read-undef>");
        // If this is a <def,read-undef>, then it invalidates the non-written
        // part of the register. For the purpose of checking the validity of
        // the move, assume that it modifies the whole register.
        RR.Sub = 0;
      }
      addRefToMap(RR, Map, Exec);
    }
  }

  // The situation:
  //   RT = DefI
  //   ...
  //   RD = TfrI ..., RT

  // If the register-in-the-middle (RT) is used or redefined between
  // DefI and TfrI, we may not be able proceed with this transformation.
  // We can ignore a def that will not execute together with TfrI, and a
  // use that will. If there is such a use (that does execute together with
  // TfrI), we will not be able to move DefI down. If there is a use that
  // executed if TfrI's condition is false, then RT must be available
  // unconditionally (cannot be predicated).
  // Essentially, we need to be able to rename RT to RD in this segment.
  if (isRefInMap(RT, Defs, Exec_Then) || isRefInMap(RT, Uses, Exec_Else))
    return false;
  RegisterRef RD = MD;
  // If the predicate register is defined between DefI and TfrI, the only
  // potential thing to do would be to move the DefI down to TfrI, and then
  // predicate. The reaching def (DefI) must be movable down to the location
  // of the TfrI.
  // If the target register of the TfrI (RD) is not used or defined between
  // DefI and TfrI, consider moving TfrI up to DefI.
  bool CanUp =   canMoveOver(TfrI, Defs, Uses);
  bool CanDown = canMoveOver(*DefI, Defs, Uses);
  // The TfrI does not access memory, but DefI could. Check if it's safe
  // to move DefI down to TfrI.
  if (DefI->mayLoad() || DefI->mayStore())
    if (!canMoveMemTo(*DefI, TfrI, true))
      CanDown = false;

  DEBUG(dbgs() << "Can move up: " << (CanUp ? "yes" : "no")
               << ", can move down: " << (CanDown ? "yes\n" : "no\n"));
  MachineBasicBlock::iterator PastDefIt = std::next(DefIt);
  if (CanUp)
    predicateAt(MD, *DefI, PastDefIt, MP, Cond, UpdRegs);
  else if (CanDown)
    predicateAt(MD, *DefI, TfrIt, MP, Cond, UpdRegs);
  else
    return false;

  if (RT != RD) {
    renameInRange(RT, RD, PredR, Cond, PastDefIt, TfrIt);
    UpdRegs.insert(RT.Reg);
  }

  removeInstr(TfrI);
  removeInstr(*DefI);
  return true;
}