SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                  unsigned EltSize,
                                  SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {

    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        addDefsToList(*MBBI, DefsToMove);

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      for (const MachineOperand *Def : DefsToMove) {
        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
        // If ReadDef is true, then there is a use of Def between I
        // and the instruction that I will potentially be merged with. We
        // will need to move this instruction after the merged instructions.
        if (ReadDef) {
          addDefsToList(*MBBI, DefsToMove);

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // FIXME: This is too conservative.
  return E;