SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst,
    SDValue Src, SDValue Size, unsigned Align, bool isVolatile,
    bool AlwaysInline, MachinePointerInfo DstPtrInfo,
    MachinePointerInfo SrcPtrInfo) const {
  const ARMSubtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if ((Align & 3) != 0)
    return SDValue();

  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
                                  RTLIB::MEMCPY);
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
                                  RTLIB::MEMCPY);

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers.
  const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
  SDValue TFOps[6];
  SDValue Loads[6];
  uint64_t SrcOff = 0, DstOff = 0;

  // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
  // VLDM/VSTM and make this code emit it when appropriate. This would reduce
  // pressure on the general purpose registers. However this seems harder to
  // map onto the register allocator's view of the world.

  // The number of MEMCPY pseudo-instructions to emit. We use up to
  // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
  // later on. This is a lower bound on the number of MEMCPY operations we
  // must emit.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  // Code size optimisation: do not inline memcpy if expansion results in
  // more instructions than the library call.
  if (NumMEMCPYs > 1 &&
      DAG.getMachineFunction().getFunction().optForMinSize()) {
    return SDValue();
  }

  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);

  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    // Evenly distribute registers among MEMCPY operations to reduce register
    // pressure.
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;

    Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
                      DAG.getConstant(NumRegs, dl, MVT::i32));
    Src = Dst.getValue(1);
    Chain = Dst.getValue(2);

    DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);

    EmittedNumMemOps = NextEmittedNumMemOps;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  auto getRemainingValueType = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
  };
  auto getRemainingSize = [](unsigned BytesLeft) {
    return (BytesLeft >= 2) ? 2 : 1;
  };

  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, dl, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff));
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                      makeArrayRef(TFOps, i));

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    VT = getRemainingValueType(BytesLeft);
    VTSize = getRemainingSize(BytesLeft);
    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, dl, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff));
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     makeArrayRef(TFOps, i));
}
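// Illustrative only: how the loop above spreads registers across the MEMCPY
// pseudos. Copying 52 bytes on Thumb1 gives NumMemOps = 13 with
// MaxLoadsInLDM = 4, so NumMEMCPYs = 4 and the chunks come out as
// 3 + 3 + 3 + 4 registers rather than 4 + 4 + 4 + 1, keeping peak register
// pressure even. A hypothetical standalone sketch of the same arithmetic,
// not part of the LLVM sources:
#include <cstdio>
static void printMemcpyChunks(unsigned NumMemOps, unsigned MaxLoadsInLDM) {
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
  for (unsigned I = 0, Emitted = 0; I != NumMEMCPYs; ++I) {
    unsigned Next = NumMemOps * (I + 1) / NumMEMCPYs;
    std::printf("chunk %u: %u registers\n", I, Next - Emitted);
    Emitted = Next;
  }
}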
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
                                                     SDLoc dl, SDValue Chain,
                                                     SDValue Dst, SDValue Src,
                                                     SDValue Size,
                                                     unsigned Align,
                                                     bool isVolatile,
                                                     bool AlwaysInline,
                                                     MachinePointerInfo DstPtrInfo,
                                                     MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned, it is more efficient to call the library. However,
  // if calling the library is not allowed (AlwaysInline), then soldier on as
  // the code generated here is better than the long load-store sequence we
  // would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // ESI might be used as a base pointer; in that case we can't simply
  // overwrite the register. Fall back to generic code.
  const X86RegisterInfo *TRI =
      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
  if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
      TRI->getBaseRegister() == X86::ESI)
    return SDValue();

  MVT AVT;
  if (Align & 1)
    AVT = MVT::i8;
  else if (Align & 2)
    AVT = MVT::i16;
  else if (Align & 4) // DWORD aligned
    AVT = MVT::i32;
  else // QWORD aligned
    AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  SDValue InFlag;
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, isVolatile, AlwaysInline,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
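// Illustrative only: the element-size selection above picks the widest type
// the alignment allows, then splits the copy into a REP MOVS count plus a
// byte remainder handled by the follow-up memcpy. For example SizeVal = 25
// with Align = 8 on x86-64 gives AVT = i64, Count = 3 and BytesLeft = 1.
// Hypothetical standalone sketch, not part of the LLVM sources:
#include <cstdint>
#include <cstdio>
static void splitRepMovs(uint64_t SizeVal, unsigned Align, bool Is64Bit) {
  unsigned UBytes = (Align & 1) ? 1
                    : (Align & 2) ? 2
                    : (Align & 4) ? 4
                                  : (Is64Bit ? 8 : 4);
  std::printf("count=%llu remainder=%llu\n",
              (unsigned long long)(SizeVal / UBytes),
              (unsigned long long)(SizeVal % UBytes));
}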
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG,
                                                     SDLoc dl, SDValue Chain,
                                                     SDValue Dst, SDValue Src,
                                                     SDValue Size,
                                                     unsigned Align,
                                                     bool isVolatile,
                                                     MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget->getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);

    if (const char *bzeroEntry = V && V->isNullValue()
                                     ? Subtarget->getBZeroEntry()
                                     : nullptr) {
      EVT IntPtr = TLI.getPointerTy();
      Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl).setChain(Chain)
        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                   DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
        .setDiscardResult();

      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
  unsigned BytesLeft = 0;
  bool TwoRepStos = false;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:  // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
    InFlag = Chain.getValue(1);
  }

  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (TwoRepStos) {
    InFlag = Chain.getValue(1);
    Count  = Size;
    EVT CVT = Count.getValueType();
    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
    Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
                             Left, InFlag);
    InFlag = Chain.getValue(1);
    Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
  } else if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, AddrVT)),
                          Src,
                          DAG.getConstant(BytesLeft, SizeVT),
                          Align, isVolatile, DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}
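// Illustrative only: how the constant fill byte is splatted to the chosen
// element width above (0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB).
// Hypothetical standalone sketch, not part of the LLVM sources:
#include <cstdint>
static uint64_t splatByte(uint64_t Val, unsigned EltBytes) {
  Val &= 255;
  if (EltBytes >= 2) Val = (Val << 8)  | Val;  // i16 pattern
  if (EltBytes >= 4) Val = (Val << 16) | Val;  // i32 pattern
  if (EltBytes >= 8) Val = (Val << 32) | Val;  // i64 pattern
  return Val;
}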
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  SDLoc dl(N);

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
  case TargetLowering::TypeLegal:
  case TargetLowering::TypePromoteInteger:
    break;
  case TargetLowering::TypePromoteFloat:
    llvm_unreachable("Bitcast of a promotion-needing float should never need "
                     "expansion");
  case TargetLowering::TypeSoftenFloat:
    // Convert the integer operand instead.
    SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeExpandInteger:
  case TargetLowering::TypeExpandFloat:
    // Convert the expanded pieces of the input.
    GetExpandedOp(InOp, Lo, Hi);
    if (TLI.hasBigEndianPartOrdering(InVT) !=
        TLI.hasBigEndianPartOrdering(OutVT))
      std::swap(Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeSplitVector:
    GetSplitVector(InOp, Lo, Hi);
    if (TLI.hasBigEndianPartOrdering(OutVT))
      std::swap(Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeScalarizeVector:
    // Convert the element instead.
    SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeWidenVector: {
    assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
    InOp = GetWidenedVector(InOp);
    EVT LoVT, HiVT;
    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
    std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
    if (TLI.hasBigEndianPartOrdering(OutVT))
      std::swap(Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
    // is legal but the result is not.
    unsigned NumElems = 2;
    EVT ElemVT = NOutVT;
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);

    // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
    while (!isTypeLegal(NVT)) {
      unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
      // If the element size is smaller than byte, bail.
      if (NewSizeInBits < 8)
        break;
      NumElems *= 2;
      ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
      NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
    }

    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);

      SmallVector<SDValue, 8> Vals;
      for (unsigned i = 0; i < NumElems; ++i)
        Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
                                   CastInOp,
                                   DAG.getConstant(i, dl,
                                                   TLI.getVectorIdxTy())));

      // Build Lo, Hi pair by pairing extracted elements if needed.
      unsigned Slot = 0;
      for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
        // Each iteration will BUILD_PAIR two nodes and append the result
        // until there are only two nodes left, i.e. Lo and Hi.
        SDValue LHS = Vals[Slot];
        SDValue RHS = Vals[Slot + 1];
        if (TLI.isBigEndian())
          std::swap(LHS, RHS);
        Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
                                   EVT::getIntegerVT(
                                     *DAG.getContext(),
                                     LHS.getValueType().getSizeInBits() << 1),
                                   LHS, RHS));
      }
      Lo = Vals[Slot++];
      Hi = Vals[Slot++];

      if (TLI.isBigEndian())
        std::swap(Lo, Hi);

      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment = TLI.getDataLayout()->getPrefTypeAlignment(
      NOutVT.getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
                               false, false, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo,
                   false, false, false, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getConstant(IncrementSize, dl,
                                         StackPtr.getValueType()));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
                   PtrInfo.getWithOffset(IncrementSize), false, false, false,
                   MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.hasBigEndianPartOrdering(OutVT))
    std::swap(Lo, Hi);
}
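// Worked example for the pairing loop above: suppose an i64 result must be
// expanded into two i32 halves and the legal-type search settles on v4i16.
// Four i16 elements [a, b, c, d] are extracted, then the loop appends
// BUILD_PAIR(a, b) and BUILD_PAIR(c, d) as i32 values (operands swapped on
// big-endian targets), leaving exactly two i32 nodes at the end of Vals,
// which become Lo and Hi.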
SDValue SystemZSelectionDAGInfo::
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                        SDValue Dst, SDValue Byte, SDValue Size,
                        unsigned Align, bool IsVolatile,
                        MachinePointerInfo DstPtrInfo) const {
  EVT PtrVT = Dst.getValueType();

  if (IsVolatile)
    return SDValue();

  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
    uint64_t Bytes = CSize->getZExtValue();
    if (Bytes == 0)
      return SDValue();
    if (ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte)) {
      // Handle cases that can be done using at most two of
      // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
      // used if ByteVal is all zeros or all ones; in other cases,
      // we can move at most 2 halfwords.
      uint64_t ByteVal = CByte->getZExtValue();
      if (ByteVal == 0 || ByteVal == 255 ?
          Bytes <= 16 && CountPopulation_64(Bytes) <= 2 :
          Bytes <= 4) {
        unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
        unsigned Size2 = Bytes - Size1;
        SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
                                     Align, DstPtrInfo);
        if (Size2 == 0)
          return Chain1;
        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                          DAG.getConstant(Size1, PtrVT));
        DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
        SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
                                     std::min(Align, Size1), DstPtrInfo);
        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
      }
    } else {
      // Handle one and two bytes using STC.
      if (Bytes <= 2) {
        SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
                                      false, false, Align);
        if (Bytes == 1)
          return Chain1;
        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                   DAG.getConstant(1, PtrVT));
        SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
                                      DstPtrInfo.getWithOffset(1),
                                      false, false, 1);
        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
      }
    }
    assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");

    // Copy the byte to the first location and then use MVC to copy
    // it to the rest.
    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
                         false, false, Align);
    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                   DAG.getConstant(1, PtrVT));
    return emitMVC(DAG, DL, Chain, DstPlus1, Dst, Bytes - 1);
  }
  return SDValue();
}
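// Worked example for the constant-byte test above: Bytes = 12 satisfies
// "Bytes <= 16 && CountPopulation_64(Bytes) <= 2" (12 = 0b1100 has two set
// bits), so Size1 = 1 << findLastSet(12) = 8 and Size2 = 4: one 8-byte store
// followed by a 4-byte store at offset 8. Bytes = 7 has three set bits, so
// it falls through to the single store plus overlapping MVC path instead.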
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
                                                     SDLoc dl, SDValue Chain,
                                                     SDValue Dst, SDValue Src,
                                                     SDValue Size,
                                                     unsigned Align,
                                                     bool isVolatile,
                                                     bool AlwaysInline,
                                                     MachinePointerInfo DstPtrInfo,
                                                     MachinePointerInfo SrcPtrInfo) const {
  // Do repeated 4-byte loads and stores. To be improved.
  // This requires 4-byte alignment.
  if ((Align & 3) != 0)
    return SDValue();
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  unsigned BytesLeft = SizeVal & 3;
  unsigned NumMemOps = SizeVal >> 2;
  unsigned EmittedNumMemOps = 0;
  EVT VT = MVT::i32;
  unsigned VTSize = 4;
  unsigned i = 0;
  const unsigned MAX_LOADS_IN_LDM = 6;
  SDValue TFOps[MAX_LOADS_IN_LDM];
  SDValue Loads[MAX_LOADS_IN_LDM];
  uint64_t SrcOff = 0, DstOff = 0;

  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
  // same number of stores.  The loads and stores will get combined into
  // ldm/stm later on.
  while (EmittedNumMemOps < NumMemOps) {
    for (i = 0;
         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
      Loads[i] = DAG.getLoad(VT, dl, Chain,
                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                         DAG.getConstant(SrcOff, MVT::i32)),
                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
                             false, false, 0);
      TFOps[i] = Loads[i].getValue(1);
      SrcOff += VTSize;
    }
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

    for (i = 0;
         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                          DAG.getConstant(DstOff, MVT::i32)),
                              DstPtrInfo.getWithOffset(DstOff),
                              isVolatile, false, 0);
      DstOff += VTSize;
    }
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

    EmittedNumMemOps += i;
  }

  if (BytesLeft == 0)
    return Chain;

  // Issue loads / stores for the trailing (1 - 3) bytes.
  unsigned BytesLeftSave = BytesLeft;
  i = 0;
  while (BytesLeft) {
    if (BytesLeft >= 2) {
      VT = MVT::i16;
      VTSize = 2;
    } else {
      VT = MVT::i8;
      VTSize = 1;
    }

    Loads[i] = DAG.getLoad(VT, dl, Chain,
                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                       DAG.getConstant(SrcOff, MVT::i32)),
                           SrcPtrInfo.getWithOffset(SrcOff),
                           false, false, false, 0);
    TFOps[i] = Loads[i].getValue(1);
    ++i;
    SrcOff += VTSize;
    BytesLeft -= VTSize;
  }
  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);

  i = 0;
  BytesLeft = BytesLeftSave;
  while (BytesLeft) {
    if (BytesLeft >= 2) {
      VT = MVT::i16;
      VTSize = 2;
    } else {
      VT = MVT::i8;
      VTSize = 1;
    }

    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                        DAG.getConstant(DstOff, MVT::i32)),
                            DstPtrInfo.getWithOffset(DstOff),
                            false, false, 0);
    ++i;
    DstOff += VTSize;
    BytesLeft -= VTSize;
  }
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
}
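// Worked example: SizeVal = 11 gives NumMemOps = 2 (two 4-byte load/store
// pairs) and BytesLeft = 3, so the trailing loops emit one i16 load/store
// pair at offset 8 followed by one i8 pair at offset 10.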
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst,
    SDValue Src, SDValue Size, unsigned Align, bool isVolatile,
    bool AlwaysInline, MachinePointerInfo DstPtrInfo,
    MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
  if (!ConstantSize)
    return SDValue();
  RepMovsRepeats Repeats(ConstantSize->getZExtValue());
  if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned, it is more efficient to call the library. However,
  // if calling the library is not allowed (AlwaysInline), then soldier on as
  // the code generated here is better than the long load-store sequence we
  // would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If the base register might conflict with our physical registers, bail
  // out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI,
                                  X86::ECX, X86::ESI, X86::EDI};
  if (isBaseRegConflictPossible(DAG, ClobberSet))
    return SDValue();

  // If the target has enhanced REPMOVSB, then it's at least as fast to use
  // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
  // BytesLeft.
  if (!Subtarget.hasERMSB() && !(Align & 1)) {
    if (Align & 2)
      // WORD aligned
      Repeats.AVT = MVT::i16;
    else if (Align & 4)
      // DWORD aligned
      Repeats.AVT = MVT::i32;
    else
      // QWORD aligned
      Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;

    if (Repeats.BytesLeft() > 0 &&
        DAG.getMachineFunction().getFunction().optForMinSize()) {
      // When aggressively optimizing for size, avoid generating the code to
      // handle BytesLeft.
      Repeats.AVT = MVT::i8;
    }
  }

  SDValue InFlag;
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
                           DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (Repeats.BytesLeft()) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = Repeats.Size - Repeats.BytesLeft();
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, dl,
                                                                DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, dl,
                                                                SrcVT)),
                                    DAG.getConstant(Repeats.BytesLeft(), dl,
                                                    SizeVT),
                                    Align, isVolatile, AlwaysInline, false,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}
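// RepMovsRepeats is defined elsewhere in the file; this is a minimal sketch
// of the interface the function above relies on, assuming this shape (the
// real definition may differ):
struct RepMovsRepeats {
  RepMovsRepeats(uint64_t Size) : Size(Size) {}
  uint64_t Count() const { return Size / UBytes(); }      // REP iteration count
  uint64_t BytesLeft() const { return Size % UBytes(); }  // tail remainder
  uint64_t UBytes() const { return AVT.getSizeInBits() / 8; }
  uint64_t Size;
  MVT AVT = MVT::i8; // REP MOVSB unless widened by the alignment checks
};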
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG,
                                                     SDLoc dl, SDValue Chain,
                                                     SDValue Dst, SDValue Src,
                                                     SDValue Size,
                                                     unsigned Align,
                                                     bool isVolatile,
                                                     bool AlwaysInline,
                                                     MachinePointerInfo DstPtrInfo,
                                                     MachinePointerInfo SrcPtrInfo) const {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned, it is more efficient to call the library. However,
  // if calling the library is not allowed (AlwaysInline), then soldier on as
  // the code generated here is better than the long load-store sequence we
  // would otherwise get.
  if (!AlwaysInline && (Align & 3) != 0)
    return SDValue();

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If ESI is used as a base pointer, we must preserve it when doing rep movs.
  const X86RegisterInfo *TRI =
      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
  bool PreserveESI = TRI->hasBasePointer(DAG.getMachineFunction()) &&
                     TRI->getBaseRegister() == X86::ESI;

  MVT AVT;
  if (Align & 1)
    AVT = MVT::i8;
  else if (Align & 2)
    AVT = MVT::i16;
  else if (Align & 4) // DWORD aligned
    AVT = MVT::i32;
  else // QWORD aligned
    AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  if (PreserveESI) {
    // Save ESI to a physical register. (We cannot use a virtual register
    // because if it is spilled we wouldn't be able to reload it.)
    // We don't glue this because the register dependencies are explicit.
    Chain = DAG.getCopyToReg(Chain, dl, X86::EDX,
                             DAG.getRegister(X86::ESI, MVT::i32));
  }

  SDValue InGlue(0, 0);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
                           Count, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
                           Dst, InGlue);
  InGlue = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
                           Src, InGlue);
  InGlue = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
  // FIXME: Make X86rep_movs explicitly use RCX, RDI, RSI instead of glue.
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
                                array_lengthof(Ops));

  if (PreserveESI) {
    InGlue = RepMovs.getValue(1);
    RepMovs = DAG.getCopyToReg(RepMovs, dl, X86::ESI,
                               DAG.getRegister(X86::EDX, MVT::i32), InGlue);
  }

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();

    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, isVolatile, AlwaysInline,
                                    DstPtrInfo.getWithOffset(Offset),
                                    SrcPtrInfo.getWithOffset(Offset)));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &Results[0], Results.size());
}
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  DebugLoc dl = N->getDebugLoc();

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
  default:
    assert(false && "Unknown type action!");
  case TargetLowering::TypeLegal:
  case TargetLowering::TypePromoteInteger:
    break;
  case TargetLowering::TypeSoftenFloat:
    // Convert the integer operand instead.
    SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeExpandInteger:
  case TargetLowering::TypeExpandFloat:
    // Convert the expanded pieces of the input.
    GetExpandedOp(InOp, Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeSplitVector:
    GetSplitVector(InOp, Lo, Hi);
    if (TLI.isBigEndian())
      std::swap(Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeScalarizeVector:
    // Convert the element instead.
    SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  case TargetLowering::TypeWidenVector: {
    assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
    InOp = GetWidenedVector(InOp);
    EVT InNVT = EVT::getVectorVT(*DAG.getContext(),
                                 InVT.getVectorElementType(),
                                 InVT.getVectorNumElements()/2);
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                     DAG.getIntPtrConstant(0));
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
    if (TLI.isBigEndian())
      std::swap(Lo, Hi);
    Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
    Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
    return;
  }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
    // is legal but the result is not.
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);
    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
      Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(0));
      Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(1));
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment = TLI.getTargetData()->getPrefTypeAlignment(
      NOutVT.getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
                               false, false, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, false, false, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getIntPtrConstant(IncrementSize));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
                   PtrInfo.getWithOffset(IncrementSize), false, false,
                   MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.isBigEndian())
    std::swap(Lo, Hi);
}
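// Worked example for the stack fallback above: expanding i64 = BITCAST f64 on
// a 32-bit target stores the f64 to an aligned stack temporary, then reloads
// two i32 halves at offsets 0 and 4 (IncrementSize = 32 / 8 = 4), swapping Lo
// and Hi on big-endian targets so the pair ordering stays correct.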
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst,
    SDValue Val, SDValue Size, unsigned Align, bool isVolatile,
    MachinePointerInfo DstPtrInfo) const {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  const X86Subtarget &Subtarget =
      DAG.getMachineFunction().getSubtarget<X86Subtarget>();

#ifndef NDEBUG
  // If the base register might conflict with our physical registers, bail
  // out.
  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
                                  X86::ECX, X86::EAX, X86::EDI};
  assert(!isBaseRegConflictPossible(DAG, ClobberSet));
#endif

  // If to a segment-relative address space, use the default lowering.
  if (DstPtrInfo.getAddrSpace() >= 256)
    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 || !ConstantSize ||
      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);

    if (const char *bzeroName =
            (ValC && ValC->isNullValue())
                ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
                : nullptr) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
      Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);

      TargetLowering::CallLoweringInfo CLI(DAG);
      CLI.setDebugLoc(dl)
          .setChain(Chain)
          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
                        DAG.getExternalSymbol(bzeroName, IntPtr),
                        std::move(Args))
          .setDiscardResult();

      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag;
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
  unsigned BytesLeft = 0;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:  // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal, dl);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal, dl);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
    InFlag = Chain.getValue(1);
  }

  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);

  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, dl, AddrVT)),
                          Val, DAG.getConstant(BytesLeft, dl, SizeVT),
                          Align, isVolatile, false,
                          DstPtrInfo.getWithOffset(Offset));
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}
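// Worked example for the REP STOS setup above: a memset of SizeVal = 23 bytes
// with a constant value and 8-byte alignment on an LP64 target picks
// AVT = i64, so Count = 23 / 8 = 2 (two 8-byte stores via REP STOSQ) and
// BytesLeft = 7, which the recursive DAG.getMemset call above finishes off.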