MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isSmallTypeLdMerge(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled)
      OffsetImm /= 2;
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // Construct the new load instruction.
    // FIXME: currently we support only halfword unsigned load. We need to
    // handle byte type, signed, and store instructions as well.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if (ExtDestMI == Rt2MI) {
      // Create the bitfield extract for high half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
      // Create the bitfield extract for low half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
    } else {
      // Create the bitfield extract for low half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
      // Create the bitfield extract for high half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Handle Unscaled
  if (IsUnscaled)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(getLdStRegOp(RtMI))
                                .addOperand(getLdStRegOp(Rt2MI))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  //        instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}