SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if ((Align & 3) != 0)
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
                                  RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
                                  RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to map
  // onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we must
  // emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 && DAG.getMachineFunction().getFunction().optForMinSize()) {
    return SDValue();
  }

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     makeArrayRef(TFOps, i));
}
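
Note on the distribution arithmetic above: a minimal standalone sketch (plain
C++, not LLVM API code) of how NumMemOps 4-byte words are spread over
ceil(NumMemOps / MaxLoadsInLDM) MEMCPY pseudo-instructions. The concrete
sizes are assumptions chosen for illustration.

#include <cstdio>

int main() {
  const unsigned NumMemOps = 14;    // assumed: a 56-byte copy
  const unsigned MaxLoadsInLDM = 6; // the non-Thumb1 limit used above
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
  unsigned EmittedNumMemOps = 0;
  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    std::printf("MEMCPY %u transfers %u words\n", I,
                NextEmittedNumMemOps - EmittedNumMemOps);
    EmittedNumMemOps = NextEmittedNumMemOps;
  }
  // Prints 4, 5, 5 words rather than 6, 6, 2: same total, but register
  // pressure is spread evenly across the pseudo-instructions.
  return 0;
}
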
Example #2
SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                        SDValue Chain, SDValue Dst, SDValue Src,
                                        SDValue Size, unsigned Align,
                                        bool isVolatile, bool AlwaysInline,
                                         MachinePointerInfo DstPtrInfo,
                                         MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  /// If not DWORD aligned, it is more efficient to call the library.  However
  /// if calling the library is not allowed (AlwaysInline), then soldier on as
  /// the code generated here is better than the long load-store sequence we
  /// would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 ||
      SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // ESI might be used as a base pointer, in that case we can't simply overwrite
  // the register.  Fall back to generic code.
  const X86RegisterInfo *TRI =
      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
  if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
      TRI->getBaseRegister() == X86::ESI)
    return SDValue();

  MVT AVT;
  if (Align & 1)
    AVT = MVT::i8;
  else if (Align & 2)
    AVT = MVT::i16;
  else if (Align & 4)
    // DWORD aligned
    AVT = MVT::i32;
  else
    // QWORD aligned
    AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  SDValue InFlag;
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                              X86::ECX,
                            Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                              X86::EDI,
                            Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
                                                              X86::ESI,
                            Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, isVolatile, AlwaysInline,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
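
Note: the unit-size selection above reduces to simple modular arithmetic. A
standalone sketch (not LLVM API code; a 64-bit subtarget and the shown
constants are assumptions): AVT is picked from the low alignment bits, REP
MOVS runs SizeVal / UBytes iterations, and the SizeVal % UBytes remainder is
handled by the recursive getMemcpy call at the end.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t SizeVal = 30; // assumed constant copy size
  const unsigned Align = 8;    // assumed QWORD alignment
  unsigned UBytes;
  if (Align & 1)      UBytes = 1; // byte ops
  else if (Align & 2) UBytes = 2; // WORD aligned
  else if (Align & 4) UBytes = 4; // DWORD aligned
  else                UBytes = 8; // QWORD aligned (is64Bit assumed)
  uint64_t CountVal = SizeVal / UBytes;  // REP MOVS iteration count (in RCX)
  unsigned BytesLeft = SizeVal % UBytes; // trailing bytes for the tail copy
  std::printf("rep movs: %llu units of %u bytes, %u-byte tail\n",
              (unsigned long long)CountVal, UBytes, BytesLeft); // 3, 8, 6
  return 0;
}
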
Example #3
SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                             SDValue Chain,
                                             SDValue Dst, SDValue Src,
                                             SDValue Size, unsigned Align,
                                             bool isVolatile,
                                         MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 ||
      !ConstantSize ||
      ConstantSize->getZExtValue() >
        Subtarget->getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);

    if (const char *bzeroEntry = (V && V->isNullValue())
                                     ? Subtarget->getBZeroEntry() : nullptr) {
      EVT IntPtr = TLI.getPointerTy();
      Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl).setChain(Chain)
        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                   DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
        .setDiscardResult();

      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
  unsigned BytesLeft = 0;
  bool TwoRepStos = false;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:  // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
      BytesLeft = SizeVal % UBytes;
    }

    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
                              InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count  = DAG.getIntPtrConstant(SizeVal);
    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
    InFlag = Chain.getValue(1);
  }

  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                              X86::ECX,
                            Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                              X86::EDI,
                            Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (TwoRepStos) {
    InFlag = Chain.getValue(1);
    Count  = Size;
    EVT CVT = Count.getValueType();
    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
                                                             X86::ECX,
                              Left, InFlag);
    InFlag = Chain.getValue(1);
    Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
  } else if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, AddrVT)),
                          Src,
                          DAG.getConstant(BytesLeft, SizeVT),
                          Align, isVolatile, DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}
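
Note: the shift cascade in the ValC branch simply replicates the low fill
byte across the widest register the alignment permits. A standalone sketch of
just that splat (the helper name splatByte is hypothetical, not an LLVM API):

#include <cstdint>
#include <cstdio>

uint64_t splatByte(uint64_t Val, unsigned Align, bool Is64Bit) {
  Val &= 255;
  switch (Align & 3) {
  case 2:                              // WORD aligned -> 16-bit pattern (AX)
    return (Val << 8) | Val;
  case 0:                              // DWORD aligned (EAX)
    Val = (Val << 8) | Val;
    Val = (Val << 16) | Val;
    if (Is64Bit && (Align & 0x7) == 0) // QWORD aligned -> 64-bit (RAX)
      Val = (Val << 32) | Val;
    return Val;
  default:                             // byte aligned (AL)
    return Val;
  }
}

int main() {
  // Prints 0xabababababababab: one byte widened to a full REP STOSQ unit.
  std::printf("%#llx\n", (unsigned long long)splatByte(0xAB, 8, true));
  return 0;
}
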
Example #4
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  SDLoc dl(N);

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
    case TargetLowering::TypeLegal:
    case TargetLowering::TypePromoteInteger:
      break;
    case TargetLowering::TypePromoteFloat:
      llvm_unreachable("Bitcast of a promotion-needing float should never "
                       "need expansion");
    case TargetLowering::TypeSoftenFloat:
      // Convert the integer operand instead.
      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeExpandInteger:
    case TargetLowering::TypeExpandFloat:
      // Convert the expanded pieces of the input.
      GetExpandedOp(InOp, Lo, Hi);
      if (TLI.hasBigEndianPartOrdering(InVT) !=
          TLI.hasBigEndianPartOrdering(OutVT))
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeSplitVector:
      GetSplitVector(InOp, Lo, Hi);
      if (TLI.hasBigEndianPartOrdering(OutVT))
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeScalarizeVector:
      // Convert the element instead.
      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeWidenVector: {
      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
      InOp = GetWidenedVector(InOp);
      EVT LoVT, HiVT;
      std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
      std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
      if (TLI.hasBigEndianPartOrdering(OutVT))
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
    // is legal but the result is not.
    unsigned NumElems = 2;
    EVT ElemVT = NOutVT;
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);

    // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
    while (!isTypeLegal(NVT)) {
      unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
      // If the element size is smaller than byte, bail.
      if (NewSizeInBits < 8)
        break;
      NumElems *= 2;
      ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
      NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
    }

    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);

      SmallVector<SDValue, 8> Vals;
      for (unsigned i = 0; i < NumElems; ++i)
        Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
                                   CastInOp, DAG.getConstant(i, dl,
                                             TLI.getVectorIdxTy())));

      // Build Lo, Hi pair by pairing extracted elements if needed.
      unsigned Slot = 0;
      for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
        // Each iteration will BUILD_PAIR two nodes and append the result until
        // there are only two nodes left, i.e. Lo and Hi.
        SDValue LHS = Vals[Slot];
        SDValue RHS = Vals[Slot + 1];

        if (TLI.isBigEndian())
          std::swap(LHS, RHS);

        Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
                                   EVT::getIntegerVT(
                                     *DAG.getContext(),
                                     LHS.getValueType().getSizeInBits() << 1),
                                   LHS, RHS));
      }
      Lo = Vals[Slot++];
      Hi = Vals[Slot++];

      if (TLI.isBigEndian())
        std::swap(Lo, Hi);

      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment =
    TLI.getDataLayout()->getPrefTypeAlignment(NOutVT.
                                              getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
                               false, false, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo,
                   false, false, false, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getConstant(IncrementSize, dl,
                                         StackPtr.getValueType()));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
                   PtrInfo.getWithOffset(IncrementSize), false,
                   false, false, MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.hasBigEndianPartOrdering(OutVT))
    std::swap(Lo, Hi);
}
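
Note: the BUILD_PAIR loop above consumes two entries per iteration and
appends their concatenation, so Vals shrinks by one net entry per step until
exactly Lo and Hi remain. A standalone model of that reduction (plain
integers stand in for SDValues; 16-bit halves and little-endian part
ordering are assumptions for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Four extracted 16-bit elements of an assumed v4i16-like operand.
  std::vector<uint32_t> Vals = {0x1111, 0x2222, 0x3333, 0x4444};
  const unsigned Bits = 16;
  unsigned Slot = 0;
  for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
    uint32_t LHS = Vals[Slot];
    uint32_t RHS = Vals[Slot + 1];
    // BUILD_PAIR semantics: operand 0 is the low part. (The code above
    // swaps LHS and RHS first on big-endian targets.)
    Vals.push_back((RHS << Bits) | LHS);
  }
  uint32_t Lo = Vals[Slot];
  uint32_t Hi = Vals[Slot + 1];
  std::printf("Lo=%#x Hi=%#x\n", Lo, Hi); // Lo=0x22221111 Hi=0x44443333
  return 0;
}
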
Example #5
SDValue SystemZSelectionDAGInfo::
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                        SDValue Dst, SDValue Byte, SDValue Size,
                        unsigned Align, bool IsVolatile,
                        MachinePointerInfo DstPtrInfo) const {
  EVT PtrVT = Dst.getValueType();

  if (IsVolatile)
    return SDValue();

  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
    uint64_t Bytes = CSize->getZExtValue();
    if (Bytes == 0)
      return SDValue();
    if (ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte)) {
      // Handle cases that can be done using at most two of
      // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
      // used if ByteVal is all zeros or all ones; in other cases,
      // we can move at most 2 halfwords.
      uint64_t ByteVal = CByte->getZExtValue();
      if ((ByteVal == 0 || ByteVal == 255) ?
          Bytes <= 16 && CountPopulation_64(Bytes) <= 2 :
          Bytes <= 4) {
        unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
        unsigned Size2 = Bytes - Size1;
        SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
                                     Align, DstPtrInfo);
        if (Size2 == 0)
          return Chain1;
        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                          DAG.getConstant(Size1, PtrVT));
        DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
        SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
                                     std::min(Align, Size1), DstPtrInfo);
        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
      }
    } else {
      // Handle one and two bytes using STC.
      if (Bytes <= 2) {
        SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
                                      false, false, Align);
        if (Bytes == 1)
          return Chain1;
        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                   DAG.getConstant(1, PtrVT));
        SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
                                      DstPtrInfo.getWithOffset(1),
                                      false, false, 1);
        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
      }
    }
    assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
    // Copy the byte to the first location and then use MVC to copy
    // it to the rest.
    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
                         false, false, Align);
    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                   DAG.getConstant(1, PtrVT));
    return emitMVC(DAG, DL, Chain, DstPlus1, Dst, Bytes - 1);
  }
  return SDValue();
}
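
Note: the Bytes <= 16 && CountPopulation_64(Bytes) <= 2 test above
guarantees that the constant-byte case decomposes into at most two
power-of-two stores (1, 2, 4 or 8 bytes each, the widths MVI, MVHHI, MVHI
and MVGHI can cover). A standalone sketch of the split; this findLastSet is
a stand-in for llvm::findLastSet (index of the most significant set bit):

#include <cstdio>

static unsigned findLastSet(unsigned long long V) {
  unsigned I = 0;
  while (V >>= 1)
    ++I;
  return I;
}

int main() {
  for (unsigned Bytes : {16u, 12u, 9u, 3u}) {
    unsigned Size1 = Bytes == 16 ? 8 : 1u << findLastSet(Bytes);
    unsigned Size2 = Bytes - Size1;
    std::printf("%2u bytes -> stores of %u and %u\n", Bytes, Size1, Size2);
  }
  // 16 -> 8+8, 12 -> 8+4, 9 -> 8+1, 3 -> 2+1.
  return 0;
}
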
Example #6
SDValue
ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                             SDValue Chain,
                                             SDValue Dst, SDValue Src,
                                             SDValue Size, unsigned Align,
                                             bool isVolatile, bool AlwaysInline,
                                             MachinePointerInfo DstPtrInfo,
                                          MachinePointerInfo SrcPtrInfo) const {
  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if ((Align & 3) != 0)
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  const unsigned MAX_LOADS_IN_LDM = 6;
  SDValue TFOps[MAX_LOADS_IN_LDM];
  SDValue Loads[MAX_LOADS_IN_LDM];
  uint64_t SrcOff = 0, DstOff = 0;

  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
  // same number of stores.  The loads and stores will get combined into
  // ldm/stm later on.
  while (EmittedNumMemOps < NumMemOps) {
    for (i = 0;
         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
      Loads[i] = DAG.getLoad(VT, dl, Chain,
                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                         DAG.getConstant(SrcOff, MVT::i32)),
                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
                             false, false, 0);
      TFOps[i] = Loads[i].getValue(1);
      SrcOff += VTSize;
    }
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

    for (i = 0;
         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                          DAG.getConstant(DstOff, MVT::i32)),
                              DstPtrInfo.getWithOffset(DstOff),
                              isVolatile, false, 0);
      DstOff += VTSize;
    }
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

    EmittedNumMemOps += i;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    if (BytesLeft >= 2) {
      VT = MVT::i16;
      VTSize = 2;
    } else {
      VT = MVT::i8;
      VTSize = 1;
    }

    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff),
                           false, false, false, 0);
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    if (BytesLeft >= 2) {
      VT = MVT::i16;
      VTSize = 2;
    } else {
      VT = MVT::i8;
      VTSize = 1;
    }

    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
}
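
Note: in both this and the newer ARM version, the trailing-byte loops always
break the 1-3 leftover bytes into at most one i16 access followed by at most
one i8 access. A standalone trace of that decomposition (not LLVM API code):

#include <cstdio>

int main() {
  for (unsigned BytesLeft : {1u, 2u, 3u}) {
    std::printf("%u trailing byte(s):", BytesLeft);
    while (BytesLeft) {
      unsigned VTSize = BytesLeft >= 2 ? 2 : 1; // MVT::i16, else MVT::i8
      std::printf(" %u-byte op", VTSize);
      BytesLeft -= VTSize;
    }
    std::printf("\n"); // 1 -> i8; 2 -> i16; 3 -> i16 then i8
  }
  return 0;
}
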
Example #7
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
  if (!ConstantSize)
    return SDValue();
  RepMovsRepeats Repeats(ConstantSize->getZExtValue());
  if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  /// If not DWORD aligned, it is more efficient to call the library.  However
  /// if calling the library is not allowed (AlwaysInline), then soldier on as
  /// the code generated here is better than the long load-store sequence we
  /// would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 ||
      SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  // If the target has enhanced REPMOVSB, then it's at least as fast to use
  // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
  // BytesLeft.
  if (!Subtarget.hasERMSB() && !(Align & 1)) {
    if (Align & 2)
      // WORD aligned
      Repeats.AVT = MVT::i16;
    else if (Align & 4)
      // DWORD aligned
      Repeats.AVT = MVT::i32;
    else
      // QWORD aligned
      Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;

    if (Repeats.BytesLeft() > 0 &&
        DAG.getMachineFunction().getFunction().optForMinSize()) {
      // When aggressively optimizing for size, avoid generating the code to
      // handle BytesLeft.
      Repeats.AVT = MVT::i8;
    }
  }

  SDValue InFlag;
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
                           DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (Repeats.BytesLeft()) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = Repeats.Size - Repeats.BytesLeft();
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, dl,
                                                                DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, dl,
                                                                SrcVT)),
                                    DAG.getConstant(Repeats.BytesLeft(), dl,
                                                    SizeVT),
                                    Align, isVolatile, AlwaysInline, false,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
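
Note: a standalone sketch of the unit-size policy in this version, under the
same assumptions as the code above: with ERMSB the byte-sized REP MOVSB is
kept as-is, otherwise the unit widens with alignment, and at minimum-size
optimization it falls back to bytes whenever a tail would otherwise require
extra code. The helper name chooseUnitBytes is hypothetical.

#include <cstdint>
#include <cstdio>

unsigned chooseUnitBytes(uint64_t Size, unsigned Align, bool HasERMSB,
                         bool Is64Bit, bool MinSize) {
  unsigned Unit = 1;                            // default: REP MOVSB
  if (!HasERMSB && !(Align & 1)) {
    if (Align & 2)      Unit = 2;               // WORD aligned
    else if (Align & 4) Unit = 4;               // DWORD aligned
    else                Unit = Is64Bit ? 8 : 4; // QWORD aligned
    if (Size % Unit != 0 && MinSize)
      Unit = 1;                                 // avoid emitting tail handling
  }
  return Unit;
}

int main() {
  std::printf("%u\n", chooseUnitBytes(30, 8, false, true, false)); // 8
  std::printf("%u\n", chooseUnitBytes(30, 8, false, true, true));  // 1
  std::printf("%u\n", chooseUnitBytes(30, 8, true, true, false));  // 1
  return 0;
}
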
Example #8
SDValue
X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                             SDValue Chain, SDValue Dst,
                                             SDValue Src, SDValue Size,
                                             unsigned Align, bool isVolatile,
                                             bool AlwaysInline,
                                             MachinePointerInfo DstPtrInfo,
                                          MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  /// If not DWORD aligned, it is more efficient to call the library.  However
  /// if calling the library is not allowed (AlwaysInline), then soldier on as
  /// the code generated here is better than the long load-store sequence we
  /// would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 ||
      SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If ESI is used as a base pointer, we must preserve it when doing rep movs.
  const X86RegisterInfo *TRI =
      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
  bool PreserveESI = TRI->hasBasePointer(DAG.getMachineFunction()) &&
                     TRI->getBaseRegister() == X86::ESI;

  MVT AVT;
  if (Align & 1)
    AVT = MVT::i8;
  else if (Align & 2)
    AVT = MVT::i16;
  else if (Align & 4)
    // DWORD aligned
    AVT = MVT::i32;
  else
    // QWORD aligned
    AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  if (PreserveESI) {
    // Save ESI to a physical register. (We cannot use a virtual register
    // because if it is spilled we wouldn't be able to reload it.)
    // We don't glue this because the register dependencies are explicit.
    Chain = DAG.getCopyToReg(Chain, dl, X86::EDX,
                             DAG.getRegister(X86::ESI, MVT::i32));
  }

  SDValue InGlue(0, 0);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                              X86::ECX,
                            Count, InGlue);
  InGlue = Chain.getValue(1);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                              X86::EDI,
                            Dst, InGlue);
  InGlue = Chain.getValue(1);
  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
                                                              X86::ESI,
                            Src, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
  // FIXME: Make X86rep_movs explicitly use RCX, RDI, RSI instead of glue.
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
                                array_lengthof(Ops));

  if (PreserveESI) {
    InGlue = RepMovs.getValue(1);
    RepMovs = DAG.getCopyToReg(RepMovs, dl, X86::ESI,
                               DAG.getRegister(X86::EDX, MVT::i32), InGlue);
  }

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);

  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, isVolatile, AlwaysInline,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &Results[0], Results.size());
}
Example #9
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  DebugLoc dl = N->getDebugLoc();

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
    default:
      assert(false && "Unknown type action!");
    case TargetLowering::TypeLegal:
    case TargetLowering::TypePromoteInteger:
      break;
    case TargetLowering::TypeSoftenFloat:
      // Convert the integer operand instead.
      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeExpandInteger:
    case TargetLowering::TypeExpandFloat:
      // Convert the expanded pieces of the input.
      GetExpandedOp(InOp, Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeSplitVector:
      GetSplitVector(InOp, Lo, Hi);
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeScalarizeVector:
      // Convert the element instead.
      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeWidenVector: {
      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
      InOp = GetWidenedVector(InOp);
      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
                                   InVT.getVectorNumElements()/2);
      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getIntPtrConstant(0));
      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
    // is legal but the result is not.
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);

    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
      Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(0));
      Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(1));

      if (TLI.isBigEndian())
        std::swap(Lo, Hi);

      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment =
    TLI.getTargetData()->getPrefTypeAlignment(NOutVT.
                                              getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
                               false, false, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, false, false, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getIntPtrConstant(IncrementSize));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
                   PtrInfo.getWithOffset(IncrementSize), false,
                   false, MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.isBigEndian())
    std::swap(Lo, Hi);
}
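
Note: both BITCAST expanders end with the same stack-slot fallback: store the
operand, load the two halves back at offsets 0 and IncrementSize, then swap
for big-endian part ordering. A standalone model of that memory round trip
(memcpy stands in for the stack store and loads; a little-endian host and a
64-bit-to-2x32-bit expansion are assumptions):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint64_t In = 0x0123456789ABCDEFULL; // the operand being expanded
  unsigned char Slot[8];               // the aligned stack temporary
  std::memcpy(Slot, &In, 8);           // the store to the stack slot

  uint32_t Lo, Hi;
  std::memcpy(&Lo, Slot, 4);           // load at offset 0
  std::memcpy(&Hi, Slot + 4, 4);       // load at offset IncrementSize (4)

  // With big-endian part ordering the halves would be swapped here.
  std::printf("Lo=%#x Hi=%#x\n", Lo, Hi); // Lo=0x89abcdef Hi=0x1234567 on LE
  return 0;
}
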
Example #10
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
    SDValue Size, unsigned Align, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

#ifndef NDEBUG
  // If the base register might conflict with our physical registers, bail out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
#endif

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);

    if (const char *bzeroName = (ValC && ValC->isNullValue())
        ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
        : nullptr) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl)
          .setChain(Chain)
          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                        DAG.getExternalSymbol(bzeroName, IntPtr),
                        std::move(Args))
          .setDiscardResult();

      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
  unsigned BytesLeft = 0;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:  // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal, dl);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count  = DAG.getIntPtrConstant(SizeVal, dl);
    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
    InFlag = Chain.getValue(1);
  }

  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, dl, AddrVT)),
                          Val,
                          DAG.getConstant(BytesLeft, dl, SizeVT),
                          Align, isVolatile, false,
                          DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}
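
Note: a standalone sketch of the dispatch at the top of this version: inline
REP STOS only for DWORD-aligned, constant, small-enough sizes; otherwise
prefer the bzero entry point when the fill value is known to be zero and the
target provides one, else return SDValue() so generic lowering emits a
memset call. The enum and helper names are hypothetical, not LLVM API.

#include <cstdint>
#include <cstdio>

enum class MemsetLowering { RepStos, BZeroCall, GenericMemset };

MemsetLowering classify(bool SizeIsConst, uint64_t Size, unsigned Align,
                        uint64_t Threshold, bool ValIsConstZero,
                        bool HaveBZero) {
  if ((Align & 3) != 0 || !SizeIsConst || Size > Threshold)
    return (ValIsConstZero && HaveBZero) ? MemsetLowering::BZeroCall
                                         : MemsetLowering::GenericMemset;
  return MemsetLowering::RepStos;
}

int main() {
  // A large zero fill prefers bzero; a small aligned fill stays inline.
  std::printf("%d\n", (int)classify(true, 1 << 20, 16, 4096, true, true)); // 1
  std::printf("%d\n", (int)classify(true, 64, 4, 4096, false, false));     // 0
  return 0;
}
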