SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
  // The vector type is legal but the element type needs expansion.
  EVT VecVT = N->getValueType(0);
  unsigned NumElts = VecVT.getVectorNumElements();
  EVT OldVT = N->getOperand(0).getValueType();
  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
  SDLoc dl(N);

  assert(OldVT == VecVT.getVectorElementType() &&
         "BUILD_VECTOR operand type doesn't match vector element type!");

  // Build a vector of twice the length out of the expanded elements.
  // For example <3 x i64> -> <6 x i32>.
  std::vector<SDValue> NewElts;
  NewElts.reserve(NumElts*2);

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue Lo, Hi;
    GetExpandedOp(N->getOperand(i), Lo, Hi);
    if (TLI.isBigEndian())
      std::swap(Lo, Hi);
    NewElts.push_back(Lo);
    NewElts.push_back(Hi);
  }

  SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
                               EVT::getVectorVT(*DAG.getContext(),
                                                NewVT, NewElts.size()),
                               &NewElts[0], NewElts.size());

  // Convert the new vector to the old vector type.
  return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
}
SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
  // The vector type is legal but the element type needs expansion.
  EVT VecVT = N->getValueType(0);
  unsigned NumElts = VecVT.getVectorNumElements();
  SDLoc dl(N);

  SDValue Val = N->getOperand(1);
  EVT OldEVT = Val.getValueType();
  EVT NewEVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldEVT);

  assert(OldEVT == VecVT.getVectorElementType() &&
         "Inserted element type doesn't match vector element type!");

  // Bitconvert to a vector of twice the length with elements of the expanded
  // type, insert the expanded vector elements, and then convert back.
  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEVT, NumElts*2);
  SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
                               NewVecVT, N->getOperand(0));

  SDValue Lo, Hi;
  GetExpandedOp(Val, Lo, Hi);
  if (TLI.isBigEndian())
    std::swap(Lo, Hi);

  SDValue Idx = N->getOperand(2);
  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
  NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
  Idx = DAG.getNode(ISD::ADD, dl,
                    Idx.getValueType(), Idx,
                    DAG.getConstant(1, Idx.getValueType()));
  NewVec =  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);

  // Convert the new vector to the old vector type.
  return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
}
SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
         "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<SDValue, 16> Ops(NumElts);
  Ops[0] = N->getOperand(0);
  SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
  for (unsigned i = 1; i < NumElts; ++i)
    Ops[i] = UndefVal;
  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
}
SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 DebugLoc dl, SelectionDAG &DAG) const {

  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);

  unsigned sizesofar = 0;
  unsigned idx = 0;
  for (unsigned i=0, e=Outs.size(); i!=e; ++i) {
    SDValue theVal = OutVals[i];
    EVT theValType = theVal.getValueType();
    unsigned numElems = 1;
    if (theValType.isVector()) numElems = theValType.getVectorNumElements();
    for (unsigned j=0,je=numElems; j!=je; ++j) {
      SDValue tmpval = theVal;
      if (theValType.isVector())
        tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                             theValType.getVectorElementType(),
                             tmpval, DAG.getIntPtrConstant(j));
      Chain = DAG.getNode(isABI ? NVPTXISD::StoreRetval :NVPTXISD::MoveToRetval,
          dl, MVT::Other,
          Chain,
          DAG.getConstant(isABI ? sizesofar : idx, MVT::i32),
          tmpval);
      if (theValType.isVector())
        sizesofar += theValType.getVectorElementType().getStoreSizeInBits()/8;
      else
        sizesofar += theValType.getStoreSizeInBits()/8;
      ++idx;
    }
  }

  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
}
Exemple #5
0
static void EmitTypeGenerate(raw_ostream &OS, const Record *ArgType,
                             unsigned &ArgNo) {
  MVT::SimpleValueType VT = getValueType(ArgType->getValueAsDef("VT"));

  if (ArgType->isSubClassOf("LLVMMatchType")) {
    unsigned Number = ArgType->getValueAsInt("Number");
    assert(Number < ArgNo && "Invalid matching number!");
    if (ArgType->isSubClassOf("LLVMExtendedElementVectorType"))
      OS << "VectorType::getExtendedElementVectorType"
         << "(dyn_cast<VectorType>(Tys[" << Number << "]))";
    else if (ArgType->isSubClassOf("LLVMTruncatedElementVectorType"))
      OS << "VectorType::getTruncatedElementVectorType"
         << "(dyn_cast<VectorType>(Tys[" << Number << "]))";
    else
      OS << "Tys[" << Number << "]";
  } else if (VT == MVT::iAny || VT == MVT::fAny || VT == MVT::vAny) {
    // NOTE: The ArgNo variable here is not the absolute argument number, it is
    // the index of the "arbitrary" type in the Tys array passed to the
    // Intrinsic::getDeclaration function. Consequently, we only want to
    // increment it when we actually hit an overloaded type. Getting this wrong
    // leads to very subtle bugs!
    OS << "Tys[" << ArgNo++ << "]";
  } else if (EVT(VT).isVector()) {
    EVT VVT = VT;
    OS << "VectorType::get(";
    EmitTypeForValueType(OS, VVT.getVectorElementType().getSimpleVT().SimpleTy);
    OS << ", " << VVT.getVectorNumElements() << ")";
  } else if (VT == MVT::iPTR) {
    OS << "PointerType::getUnqual(";
    EmitTypeGenerate(OS, ArgType->getValueAsDef("ElTy"), ArgNo);
    OS << ")";
  } else if (VT == MVT::iPTRAny) {
    // Make sure the user has passed us an argument type to overload. If not,
    // treat it as an ordinary (not overloaded) intrinsic.
    OS << "(" << ArgNo << " < numTys) ? Tys[" << ArgNo 
    << "] : PointerType::getUnqual(";
    EmitTypeGenerate(OS, ArgType->getValueAsDef("ElTy"), ArgNo);
    OS << ")";
    ++ArgNo;
  } else if (VT == MVT::isVoid) {
    if (ArgNo == 0)
      OS << "Type::getVoidTy(Context)";
    else
      // MVT::isVoid is used to mean varargs here.
      OS << "...";
  } else {
    EmitTypeForValueType(OS, VT);
  }
}
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
// (see LegalizeDAG.cpp). This is slow and uses local memory.
// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
SDValue NVPTXTargetLowering::
LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  SmallVector<SDValue, 8> Ops;
  unsigned NumOperands = Node->getNumOperands();
  for (unsigned i=0; i < NumOperands; ++i) {
    SDValue SubOp = Node->getOperand(i);
    EVT VVT = SubOp.getNode()->getValueType(0);
    EVT EltVT = VVT.getVectorElementType();
    unsigned NumSubElem = VVT.getVectorNumElements();
    for (unsigned j=0; j < NumSubElem; ++j) {
      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
                                DAG.getIntPtrConstant(j)));
    }
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0),
                     &Ops[0], Ops.size());
}
static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
                                      SelectionDAG &DAG,
                                      const MipsSubtarget *Subtarget) {
  // See if this is a vector splat immediate node.
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));

  if (!BV ||
      !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
                           EltSize, !Subtarget->isLittle()) ||
      (SplatBitSize != EltSize) ||
      (SplatValue.getZExtValue() >= EltSize))
    return SDValue();

  return DAG.getNode(Opc, N->getDebugLoc(), Ty, N->getOperand(0),
                     DAG.getConstant(SplatValue.getZExtValue(), MVT::i32));
}
SDValue
NVPTXTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                          DebugLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout *TD = getDataLayout();

  const Function *F = MF.getFunction();
  const AttrListPtr &PAL = F->getAttributes();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isKernel = llvm::isKernelFunction(*F);
  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
      I != E; ++I) {
    theArgs.push_back(I);
    argTypes.push_back(I->getType());
  }
  assert(argTypes.size() == Ins.size() &&
         "Ins types and function types did not match");

  int idx = 0;
  for (unsigned i=0, e=Ins.size(); i!=e; ++i, ++idx) {
    Type *Ty = argTypes[i];
    EVT ObjectVT = getValueType(Ty);
    assert(ObjectVT == Ins[i].VT &&
           "Ins type did not match function type");

    // If the kernel argument is image*_t or sampler_t, convert it to
    // a i32 constant holding the parameter position. This can later
    // matched in the AsmPrinter to output the correct mangled name.
    if (isImageOrSamplerVal(theArgs[i],
                           (theArgs[i]->getParent() ?
                               theArgs[i]->getParent()->getParent() : 0))) {
      assert(isKernel && "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i+1, MVT::i32));
      continue;
    }

    if (theArgs[i]->use_empty()) {
      // argument is dead
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT));
      continue;
    }

    // In the following cases, assign a node order of "idx+1"
    // to newly created nodes. The SDNOdes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "idx+1" holds that order.
    if (PAL.getParamAttributes(i+1).hasAttribute(Attributes::ByVal) == false) {
      // A plain scalar.
      if (isABI || isKernel) {
        // If ABI, load from the param symbol
        SDValue Arg = getParamSymbol(DAG, idx);
        Value *srcValue = new Argument(PointerType::get(ObjectVT.getTypeForEVT(
            F->getContext()),
            llvm::ADDRESS_SPACE_PARAM));
        SDValue p = DAG.getLoad(ObjectVT, dl, Root, Arg,
                                MachinePointerInfo(srcValue), false, false,
                                false,
                                TD->getABITypeAlignment(ObjectVT.getTypeForEVT(
                                  F->getContext())));
        if (p.getNode())
          DAG.AssignOrdering(p.getNode(), idx+1);
        InVals.push_back(p);
      }
      else {
        // If no ABI, just move the param symbol
        SDValue Arg = getParamSymbol(DAG, idx, ObjectVT);
        SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
        if (p.getNode())
          DAG.AssignOrdering(p.getNode(), idx+1);
        InVals.push_back(p);
      }
      continue;
    }

    // Param has ByVal attribute
    if (isABI || isKernel) {
      // Return MoveParam(param symbol).
      // Ideally, the param symbol can be returned directly,
      // but when SDNode builder decides to use it in a CopyToReg(),
      // machine instruction fails because TargetExternalSymbol
      // (not lowered) is target dependent, and CopyToReg assumes
      // the source is lowered.
      SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
      SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
      if (p.getNode())
        DAG.AssignOrdering(p.getNode(), idx+1);
      if (isKernel)
        InVals.push_back(p);
      else {
        SDValue p2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
                    DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32),
                                 p);
        InVals.push_back(p2);
      }
    } else {
      // Have to move a set of param symbols to registers and
      // store them locally and return the local pointer in InVals
      const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]);
      assert(elemPtrType &&
             "Byval parameter should be a pointer type");
      Type *elemType = elemPtrType->getElementType();
      // Compute the constituent parts
      SmallVector<EVT, 16> vtparts;
      SmallVector<uint64_t, 16> offsets;
      ComputeValueVTs(*this, elemType, vtparts, &offsets, 0);
      unsigned totalsize = 0;
      for (unsigned j=0, je=vtparts.size(); j!=je; ++j)
        totalsize += vtparts[j].getStoreSizeInBits();
      SDValue localcopy =  DAG.getFrameIndex(MF.getFrameInfo()->
                                      CreateStackObject(totalsize/8, 16, false),
                                             getPointerTy());
      unsigned sizesofar = 0;
      std::vector<SDValue> theChains;
      for (unsigned j=0, je=vtparts.size(); j!=je; ++j) {
        unsigned numElems = 1;
        if (vtparts[j].isVector()) numElems = vtparts[j].getVectorNumElements();
        for (unsigned k=0, ke=numElems; k!=ke; ++k) {
          EVT tmpvt = vtparts[j];
          if (tmpvt.isVector()) tmpvt = tmpvt.getVectorElementType();
          SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt,
                                    getParamSymbol(DAG, idx, tmpvt));
          SDValue addr = DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy,
                                    DAG.getConstant(sizesofar, getPointerTy()));
          theChains.push_back(DAG.getStore(Chain, dl, arg, addr,
                                        MachinePointerInfo(), false, false, 0));
          sizesofar += tmpvt.getStoreSizeInBits()/8;
          ++idx;
        }
      }
      --idx;
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0],
                          theChains.size());
      InVals.push_back(localcopy);
    }
  }

  // Clang will check explicit VarArg and issue error if any. However, Clang
  // will let code with
  // implicit var arg like f() pass.
  // We treat this case as if the arg list is empty.
  //if (F.isVarArg()) {
  // assert(0 && "VarArg not supported yet!");
  //}

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &OutChains[0], OutChains.size()));

  return Chain;
}
std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
  unsigned Opcode = Node->getOpcode();
  SDLoc DL(Node);

  ///
  // Instruction Selection not handled by the auto-generated
  // tablegen selection should be handled here.
  ///
  SDNode *Result;

  switch(Opcode) {
  default: break;

  case ISD::SUBE: {
    SDValue InFlag = Node->getOperand(2);
    Result = selectAddESubE(Mips::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
    return std::make_pair(true, Result);
  }

  case ISD::ADDE: {
    if (Subtarget.hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
      break;
    SDValue InFlag = Node->getOperand(2);
    Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
    return std::make_pair(true, Result);
  }

  case ISD::ConstantFP: {
    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
      if (Subtarget.hasMips64()) {
        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                              Mips::ZERO_64, MVT::i64);
        Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
      } else {
        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                              Mips::ZERO, MVT::i32);
        Result = CurDAG->getMachineNode(Mips::BuildPairF64, DL, MVT::f64, Zero,
                                        Zero);
      }

      return std::make_pair(true, Result);
    }
    break;
  }

  case ISD::Constant: {
    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
    unsigned Size = CN->getValueSizeInBits(0);

    if (Size == 32)
      break;

    MipsAnalyzeImmediate AnalyzeImm;
    int64_t Imm = CN->getSExtValue();

    const MipsAnalyzeImmediate::InstSeq &Seq =
      AnalyzeImm.Analyze(Imm, Size, false);

    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
    SDLoc DL(CN);
    SDNode *RegOpnd;
    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
                                                MVT::i64);

    // The first instruction can be a LUi which is different from other
    // instructions (ADDiu, ORI and SLL) in that it does not have a register
    // operand.
    if (Inst->Opc == Mips::LUi64)
      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
    else
      RegOpnd =
        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
                               ImmOpnd);

    // The remaining instructions in the sequence are handled here.
    for (++Inst; Inst != Seq.end(); ++Inst) {
      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
                                          MVT::i64);
      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
                                       SDValue(RegOpnd, 0), ImmOpnd);
    }

    return std::make_pair(true, RegOpnd);
  }

  case ISD::INTRINSIC_W_CHAIN: {
    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
    default:
      break;

    case Intrinsic::mips_cfcmsa: {
      SDValue ChainIn = Node->getOperand(0);
      SDValue RegIdx = Node->getOperand(2);
      SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
                                           getMSACtrlReg(RegIdx), MVT::i32);
      return std::make_pair(true, Reg.getNode());
    }
    }
    break;
  }

  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
    default:
      break;

    case Intrinsic::mips_move_v:
      // Like an assignment but will always produce a move.v even if
      // unnecessary.
      return std::make_pair(true,
                            CurDAG->getMachineNode(Mips::MOVE_V, DL,
                                                   Node->getValueType(0),
                                                   Node->getOperand(1)));
    }
    break;
  }

  case ISD::INTRINSIC_VOID: {
    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
    default:
      break;

    case Intrinsic::mips_ctcmsa: {
      SDValue ChainIn = Node->getOperand(0);
      SDValue RegIdx  = Node->getOperand(2);
      SDValue Value   = Node->getOperand(3);
      SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
                                              getMSACtrlReg(RegIdx), Value);
      return std::make_pair(true, ChainOut.getNode());
    }
    }
    break;
  }

  case MipsISD::ThreadPointer: {
    EVT PtrVT = getTargetLowering()->getPointerTy();
    unsigned RdhwrOpc, DestReg;

    if (PtrVT == MVT::i32) {
      RdhwrOpc = Mips::RDHWR;
      DestReg = Mips::V1;
    } else {
      RdhwrOpc = Mips::RDHWR64;
      DestReg = Mips::V1_64;
    }

    SDNode *Rdhwr =
      CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
                             Node->getValueType(0),
                             CurDAG->getRegister(Mips::HWR29, MVT::i32));
    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
                                         SDValue(Rdhwr, 0));
    SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
    ReplaceUses(SDValue(Node, 0), ResNode);
    return std::make_pair(true, ResNode.getNode());
  }

  case ISD::BUILD_VECTOR: {
    // Select appropriate ldi.[bhwd] instructions for constant splats of
    // 128-bit when MSA is enabled. Fixup any register class mismatches that
    // occur as a result.
    //
    // This allows the compiler to use a wider range of immediates than would
    // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
    // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
    // 0x01010101 } without using a constant pool. This would be sub-optimal
    // when // 'ldi.b wd, 1' is capable of producing that bit-pattern in the
    // same set/ of registers. Similarly, ldi.h isn't capable of producing {
    // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.

    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
    APInt SplatValue, SplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    unsigned LdiOp;
    EVT ResVecTy = BVN->getValueType(0);
    EVT ViaVecTy;

    if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
      return std::make_pair(false, (SDNode*)NULL);

    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, 8,
                              !Subtarget.isLittle()))
      return std::make_pair(false, (SDNode*)NULL);

    switch (SplatBitSize) {
    default:
      return std::make_pair(false, (SDNode*)NULL);
    case 8:
      LdiOp = Mips::LDI_B;
      ViaVecTy = MVT::v16i8;
      break;
    case 16:
      LdiOp = Mips::LDI_H;
      ViaVecTy = MVT::v8i16;
      break;
    case 32:
      LdiOp = Mips::LDI_W;
      ViaVecTy = MVT::v4i32;
      break;
    case 64:
      LdiOp = Mips::LDI_D;
      ViaVecTy = MVT::v2i64;
      break;
    }

    if (!SplatValue.isSignedIntN(10))
      return std::make_pair(false, (SDNode*)NULL);

    SDValue Imm = CurDAG->getTargetConstant(SplatValue,
                                            ViaVecTy.getVectorElementType());

    SDNode *Res = CurDAG->getMachineNode(LdiOp, SDLoc(Node), ViaVecTy, Imm);

    if (ResVecTy != ViaVecTy) {
      // If LdiOp is writing to a different register class to ResVecTy, then
      // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
      // since the source and destination register sets contain the same
      // registers.
      const TargetLowering *TLI = getTargetLowering();
      MVT ResVecTySimple = ResVecTy.getSimpleVT();
      const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
      Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, SDLoc(Node),
                                   ResVecTy, SDValue(Res, 0),
                                   CurDAG->getTargetConstant(RC->getID(),
                                                             MVT::i32));
    }

    return std::make_pair(true, Res);
  }

  }

  return std::make_pair(false, (SDNode*)NULL);
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  // We are selecting i64 ADD here instead of custom lower it during
  // DAG legalization, so we can fold some i64 ADDs used for address
  // calculation into the LOAD and STORE instructions.
  case ISD::ADD: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (N->getValueType(0) != MVT::i64 ||
        ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
      break;

    SDLoc DL(N);
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
    SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);

    SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                         DL, MVT::i32, LHS, Sub0);
    SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                         DL, MVT::i32, LHS, Sub1);

    SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                         DL, MVT::i32, RHS, Sub0);
    SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                         DL, MVT::i32, RHS, Sub1);

    SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

    SmallVector<SDValue, 8> AddLoArgs;
    AddLoArgs.push_back(SDValue(Lo0, 0));
    AddLoArgs.push_back(SDValue(Lo1, 0));

    SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL,
                                           VTList, AddLoArgs);
    SDValue Carry = SDValue(AddLo, 1);
    SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL,
                                           MVT::i32, SDValue(Hi0, 0),
                                           SDValue(Hi1, 0), Carry);

    SDValue Args[5] = {
      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
      SDValue(AddLo,0),
      Sub0,
      SDValue(AddHi,0),
      Sub1,
    };
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
  }
  case ISD::BUILD_VECTOR: {
    unsigned RegClassID;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    const AMDGPURegisterInfo *TRI =
                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
    const SIRegisterInfo *SIRI =
                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      bool UseVReg = true;
      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                    U != E; ++U) {
        if (!U->isMachineOpcode()) {
          continue;
        }
        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
        if (!RC) {
          continue;
        }
        if (SIRI->isSGPRClass(RC)) {
          UseVReg = false;
        }
      }
      switch(NumVectorElts) {
      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
                                     AMDGPU::SReg_32RegClassID;
        break;
      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
                                     AMDGPU::SReg_64RegClassID;
        break;
      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
                                     AMDGPU::SReg_128RegClassID;
        break;
      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
                                     AMDGPU::SReg_256RegClassID;
        break;
      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
                                      AMDGPU::SReg_512RegClassID;
        break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    } else {
      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
      // that adds a 128 bits reg copy when going through TwoAddressInstructions
      // pass. We want to avoid 128 bits copies as much as possible because they
      // can't be bundled by our scheduler.
      switch(NumVectorElts) {
      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    }

    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);

    if (NumVectorElts == 1) {
      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
                                  VT.getVectorElementType(),
                                  N->getOperand(0), RegClass);
    }

    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
                                  "supported yet");
    // 16 = Max Num Vector Elements
    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
    // 1 = Vector Register Class
    SDValue RegSeqArgs[16 * 2 + 1];

    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      // XXX: Why is this here?
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
      RegSeqArgs[1 + (2 * i) + 1] =
              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
        RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                  SDLoc(N), N->getValueType(0), Ops);
  }
  case AMDGPUISD::REGISTER_LOAD: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
      break;
    SDValue Addr, Offset;

    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
    const SDValue Ops[] = {
      Addr,
      Offset,
      CurDAG->getTargetConstant(0, MVT::i32),
      N->getOperand(0),
    };
    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
                                  CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
                                  Ops);
  }
  case AMDGPUISD::REGISTER_STORE: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
      break;
    SDValue Addr, Offset;
    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
    const SDValue Ops[] = {
      N->getOperand(1),
      Addr,
      Offset,
      CurDAG->getTargetConstant(0, MVT::i32),
      N->getOperand(0),
    };
    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
                                        CurDAG->getVTList(MVT::Other),
                                        Ops);
  }
  }
  return SelectCode(N);
}
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  SDLoc dl(N);

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
    case TargetLowering::TypeLegal:
    case TargetLowering::TypePromoteInteger:
      break;
    case TargetLowering::TypeSoftenFloat:
      // Convert the integer operand instead.
      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeExpandInteger:
    case TargetLowering::TypeExpandFloat:
      // Convert the expanded pieces of the input.
      GetExpandedOp(InOp, Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeSplitVector:
      GetSplitVector(InOp, Lo, Hi);
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeScalarizeVector:
      // Convert the element instead.
      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    case TargetLowering::TypeWidenVector: {
      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
      InOp = GetWidenedVector(InOp);
      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
                                   InVT.getVectorNumElements()/2);
      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getConstant(0, TLI.getVectorIdxTy()));
      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getConstant(InNVT.getVectorNumElements(),
                                       TLI.getVectorIdxTy()));
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
      return;
    }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
    // is legal but the result is not.
    unsigned NumElems = 2;
    EVT ElemVT = NOutVT;
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);

    // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
    while (!isTypeLegal(NVT)) {
      unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
      // If the element size is smaller than byte, bail.
      if (NewSizeInBits < 8)
        break;
      NumElems *= 2;
      ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
      NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
    }

    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);

      SmallVector<SDValue, 8> Vals;
      for (unsigned i = 0; i < NumElems; ++i)
        Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
                                   CastInOp, DAG.getConstant(i,
                                             TLI.getVectorIdxTy())));

      // Build Lo, Hi pair by pairing extracted elements if needed.
      unsigned Slot = 0;
      for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
        // Each iteration will BUILD_PAIR two nodes and append the result until
        // there are only two nodes left, i.e. Lo and Hi.
        SDValue LHS = Vals[Slot];
        SDValue RHS = Vals[Slot + 1];

        if (TLI.isBigEndian())
          std::swap(LHS, RHS);

        Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
                                   EVT::getIntegerVT(
                                     *DAG.getContext(),
                                     LHS.getValueType().getSizeInBits() << 1),
                                   LHS, RHS));
      }
      Lo = Vals[Slot++];
      Hi = Vals[Slot++];

      if (TLI.isBigEndian())
        std::swap(Lo, Hi);

      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment =
    TLI.getDataLayout()->getPrefTypeAlignment(NOutVT.
                                              getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
                               false, false, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo,
                   false, false, false, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getIntPtrConstant(IncrementSize));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
                   PtrInfo.getWithOffset(IncrementSize), false,
                   false, false, MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.isBigEndian())
    std::swap(Lo, Hi);
}
void DAGTypeLegalizer::ExpandRes_BIT_CONVERT(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
  EVT OutVT = N->getValueType(0);
  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
  SDValue InOp = N->getOperand(0);
  EVT InVT = InOp.getValueType();
  DebugLoc dl = N->getDebugLoc();

  // Handle some special cases efficiently.
  switch (getTypeAction(InVT)) {
    default:
      assert(false && "Unknown type action!");
    case Legal:
    case PromoteInteger:
      break;
    case SoftenFloat:
      // Convert the integer operand instead.
      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
      return;
    case ExpandInteger:
    case ExpandFloat:
      // Convert the expanded pieces of the input.
      GetExpandedOp(InOp, Lo, Hi);
      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
      return;
    case SplitVector:
      GetSplitVector(InOp, Lo, Hi);
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
      return;
    case ScalarizeVector:
      // Convert the element instead.
      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
      return;
    case WidenVector: {
      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BIT_CONVERT");
      InOp = GetWidenedVector(InOp);
      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
                                   InVT.getVectorNumElements()/2);
      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getIntPtrConstant(0));
      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
                       DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
      if (TLI.isBigEndian())
        std::swap(Lo, Hi);
      Lo = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Lo);
      Hi = DAG.getNode(ISD::BIT_CONVERT, dl, NOutVT, Hi);
      return;
    }
  }

  if (InVT.isVector() && OutVT.isInteger()) {
    // Handle cases like i64 = BIT_CONVERT v1i64 on x86, where the operand
    // is legal but the result is not.
    EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);

    if (isTypeLegal(NVT)) {
      SDValue CastInOp = DAG.getNode(ISD::BIT_CONVERT, dl, NVT, InOp);
      Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(0));
      Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
                       DAG.getIntPtrConstant(1));

      if (TLI.isBigEndian())
        std::swap(Lo, Hi);

      return;
    }
  }

  // Lower the bit-convert to a store/load from the stack.
  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");

  // Create the stack frame object.  Make sure it is aligned for both
  // the source and expanded destination types.
  unsigned Alignment =
    TLI.getTargetData()->getPrefTypeAlignment(NOutVT.getTypeForEVT(*DAG.getContext()));
  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  const Value *SV = PseudoSourceValue::getFixedStack(SPFI);

  // Emit a store to the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, SV, 0);

  // Load the first half from the stack slot.
  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, 0);

  // Increment the pointer to the other half.
  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
                         DAG.getIntPtrConstant(IncrementSize));

  // Load the second half from the stack slot.
  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr, SV, IncrementSize, false,
                   MinAlign(Alignment, IncrementSize));

  // Handle endianness of the load.
  if (TLI.isBigEndian())
    std::swap(Lo, Hi);
}
/// getVectorTypeBreakdown - Vector types are broken down into some number of
/// legal first class types.  For example, MVT::v8f32 maps to 2 MVT::v4f32
/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
///
/// This method returns the number of registers needed, and the VT for each
/// register.  It also returns the VT and quantity of the intermediate values
/// before they are promoted/expanded.
///
unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
                                                EVT &IntermediateVT,
                                                unsigned &NumIntermediates,
                                                MVT &RegisterVT) const {
  unsigned NumElts = VT.getVectorNumElements();

  // If there is a wider vector type with the same element type as this one,
  // or a promoted vector type that has the same number of elements which
  // are wider, then we should convert to that legal vector type.
  // This handles things like <2 x float> -> <4 x float> and
  // <4 x i1> -> <4 x i32>.
  LegalizeTypeAction TA = getTypeAction(Context, VT);
  if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
    EVT RegisterEVT = getTypeToTransformTo(Context, VT);
    if (isTypeLegal(RegisterEVT)) {
      IntermediateVT = RegisterEVT;
      RegisterVT = RegisterEVT.getSimpleVT();
      NumIntermediates = 1;
      return 1;
    }
  }

  // Figure out the right, legal destination reg to copy into.
  EVT EltTy = VT.getVectorElementType();

  unsigned NumVectorRegs = 1;

  // FIXME: We don't support non-power-of-2-sized vectors for now.  Ideally we
  // could break down into LHS/RHS like LegalizeDAG does.
  if (!isPowerOf2_32(NumElts)) {
    NumVectorRegs = NumElts;
    NumElts = 1;
  }

  // Divide the input until we get to a supported size.  This will always
  // end with a scalar if the target doesn't support vectors.
  while (NumElts > 1 && !isTypeLegal(
                                   EVT::getVectorVT(Context, EltTy, NumElts))) {
    NumElts >>= 1;
    NumVectorRegs <<= 1;
  }

  NumIntermediates = NumVectorRegs;

  EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
  if (!isTypeLegal(NewVT))
    NewVT = EltTy;
  IntermediateVT = NewVT;

  MVT DestVT = getRegisterType(Context, NewVT);
  RegisterVT = DestVT;
  unsigned NewVTSize = NewVT.getSizeInBits();

  // Convert sizes such as i33 to i64.
  if (!isPowerOf2_32(NewVTSize))
    NewVTSize = NextPowerOf2(NewVTSize);

  if (EVT(DestVT).bitsLT(NewVT))   // Value is expanded, e.g. i64 -> i16.
    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());

  // Otherwise, promotion or legal types use the same number of registers as
  // the vector decimated to the appropriate level.
  return NumVectorRegs;
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case ISD::BUILD_VECTOR: {
    unsigned RegClassID;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    const AMDGPURegisterInfo *TRI =
                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
    const SIRegisterInfo *SIRI =
                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      bool UseVReg = true;
      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                    U != E; ++U) {
        if (!U->isMachineOpcode()) {
          continue;
        }
        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
        if (!RC) {
          continue;
        }
        if (SIRI->isSGPRClass(RC)) {
          UseVReg = false;
        }
      }
      switch(NumVectorElts) {
      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
                                     AMDGPU::SReg_32RegClassID;
        break;
      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
                                     AMDGPU::SReg_64RegClassID;
        break;
      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
                                     AMDGPU::SReg_128RegClassID;
        break;
      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
                                     AMDGPU::SReg_256RegClassID;
        break;
      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
                                      AMDGPU::SReg_512RegClassID;
        break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    } else {
      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
      // that adds a 128 bits reg copy when going through TwoAddressInstructions
      // pass. We want to avoid 128 bits copies as much as possible because they
      // can't be bundled by our scheduler.
      switch(NumVectorElts) {
      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    }

    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);

    if (NumVectorElts == 1) {
      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
                                  VT.getVectorElementType(),
                                  N->getOperand(0), RegClass);
    }

    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
                                  "supported yet");
    // 16 = Max Num Vector Elements
    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
    // 1 = Vector Register Class
    SDValue RegSeqArgs[16 * 2 + 1];

    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      // XXX: Why is this here?
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
      RegSeqArgs[1 + (2 * i) + 1] =
              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
        RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                  SDLoc(N), N->getValueType(0), Ops);
  }
  }
  return SelectCode(N);
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
  const R600InstrInfo *TII =
                      static_cast<const R600InstrInfo*>(TM.getInstrInfo());
  unsigned int Opc = N->getOpcode();
  if (N->isMachineOpcode()) {
    return NULL;   // Already selected.
  }
  switch (Opc) {
  default: break;
  case AMDGPUISD::CONST_ADDRESS: {
    for (SDNode::use_iterator I = N->use_begin(), Next = llvm::next(I);
                              I != SDNode::use_end(); I = Next) {
      Next = llvm::next(I);
      if (!I->isMachineOpcode()) {
        continue;
      }
      unsigned Opcode = I->getMachineOpcode();
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SrcIdx = I.getOperandNo();
      int SelIdx;
      // Unlike MachineInstrs, SDNodes do not have results in their operand
      // list, so we need to increment the SrcIdx, since
      // R600InstrInfo::getOperandIdx is based on the MachineInstr indices.
      if (HasDst) {
        SrcIdx++;
      }

      SelIdx = TII->getSelIdx(I->getMachineOpcode(), SrcIdx);
      if (SelIdx < 0) {
        continue;
      }

      SDValue CstOffset;
      if (N->getValueType(0).isVector() ||
          !SelectGlobalValueConstantOffset(N->getOperand(0), CstOffset))
        continue;

      // Gather constants values
      int SrcIndices[] = {
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
      };
      std::vector<unsigned> Consts;
      for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
        int OtherSrcIdx = SrcIndices[i];
        int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
        if (OtherSrcIdx < 0 || OtherSelIdx < 0) {
          continue;
        }
        if (HasDst) {
          OtherSrcIdx--;
          OtherSelIdx--;
        }
        if (RegisterSDNode *Reg =
                         dyn_cast<RegisterSDNode>(I->getOperand(OtherSrcIdx))) {
          if (Reg->getReg() == AMDGPU::ALU_CONST) {
            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(I->getOperand(OtherSelIdx));
            Consts.push_back(Cst->getZExtValue());
          }
        }
      }

      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
      Consts.push_back(Cst->getZExtValue());
      if (!TII->fitsConstReadLimitations(Consts))
        continue;

      // Convert back to SDNode indices
      if (HasDst) {
        SrcIdx--;
        SelIdx--;
      }
      std::vector<SDValue> Ops;
      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
        if (i == SrcIdx) {
          Ops.push_back(CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32));
        } else if (i == SelIdx) {
          Ops.push_back(CstOffset);
        } else {
          Ops.push_back(I->getOperand(i));
        }
      }
      CurDAG->UpdateNodeOperands(*I, Ops.data(), Ops.size());
    }
    break;
  }
  case ISD::BUILD_VECTOR: {
    unsigned RegClassID;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    const AMDGPURegisterInfo *TRI =
                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
    const SIRegisterInfo *SIRI =
                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
    EVT VT = N->getValueType(0);
    unsigned NumVectorElts = VT.getVectorNumElements();
    assert(VT.getVectorElementType().bitsEq(MVT::i32));
    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      bool UseVReg = true;
      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
                                                    U != E; ++U) {
        if (!U->isMachineOpcode()) {
          continue;
        }
        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
        if (!RC) {
          continue;
        }
        if (SIRI->isSGPRClass(RC)) {
          UseVReg = false;
        }
      }
      switch(NumVectorElts) {
      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
                                     AMDGPU::SReg_32RegClassID;
        break;
      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
                                     AMDGPU::SReg_64RegClassID;
        break;
      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
                                     AMDGPU::SReg_128RegClassID;
        break;
      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
                                     AMDGPU::SReg_256RegClassID;
        break;
      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
                                      AMDGPU::SReg_512RegClassID;
        break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    } else {
      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
      // that adds a 128 bits reg copy when going through TwoAddressInstructions
      // pass. We want to avoid 128 bits copies as much as possible because they
      // can't be bundled by our scheduler.
      switch(NumVectorElts) {
      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
      }
    }

    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);

    if (NumVectorElts == 1) {
      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
                                  VT.getVectorElementType(),
                                  N->getOperand(0), RegClass);
    }

    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
                                  "supported yet");
    // 16 = Max Num Vector Elements
    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
    // 1 = Vector Register Class
    SDValue RegSeqArgs[16 * 2 + 1];

    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
    bool IsRegSeq = true;
    for (unsigned i = 0; i < N->getNumOperands(); i++) {
      // XXX: Why is this here?
      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
        IsRegSeq = false;
        break;
      }
      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
      RegSeqArgs[1 + (2 * i) + 1] =
              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
    }
    if (!IsRegSeq)
      break;
    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
        RegSeqArgs, 2 * N->getNumOperands() + 1);
  }
  case ISD::BUILD_PAIR: {
    SDValue RC, SubReg0, SubReg1;
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }
    if (N->getValueType(0) == MVT::i128) {
      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
    } else if (N->getValueType(0) == MVT::i64) {
      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
    } else {
      llvm_unreachable("Unhandled value type for BUILD_PAIR");
    }
    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
                            N->getOperand(1), SubReg1 };
    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                  SDLoc(N), N->getValueType(0), Ops);
  }

  case ISD::ConstantFP:
  case ISD::Constant: {
    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
    // XXX: Custom immediate lowering not implemented yet.  Instead we use
    // pseudo instructions defined in SIInstructions.td
    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
      break;
    }

    uint64_t ImmValue = 0;
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;

    if (N->getOpcode() == ISD::ConstantFP) {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::f64);

      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
      APFloat Value = C->getValueAPF();
      float FloatValue = Value.convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = Value.bitcastToAPInt().getZExtValue();
      }
    } else {
      // XXX: 64-bit Immediates not supported yet
      assert(N->getValueType(0) != MVT::i64);

      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (C->getZExtValue() == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (C->getZExtValue() == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = C->getZExtValue();
      }
    }

    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
                              Use != SDNode::use_end(); Use = Next) {
      Next = llvm::next(Use);
      std::vector<SDValue> Ops;
      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
        Ops.push_back(Use->getOperand(i));
      }

      if (!Use->isMachineOpcode()) {
          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
            // We can only use literal constants (e.g. AMDGPU::ZERO,
            // AMDGPU::ONE, etc) in machine opcodes.
            continue;
          }
      } else {
        switch(Use->getMachineOpcode()) {
        case AMDGPU::REG_SEQUENCE: break;
        default:
          if (!TII->isALUInstr(Use->getMachineOpcode()) ||
              (TII->get(Use->getMachineOpcode()).TSFlags &
               R600_InstFlag::VECTOR)) {
            continue;
          }
        }

        // Check that we aren't already using an immediate.
        // XXX: It's possible for an instruction to have more than one
        // immediate operand, but this is not supported yet.
        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
          int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
                                          AMDGPU::OpName::literal);
          if (ImmIdx == -1) {
            continue;
          }

          if (TII->getOperandIdx(Use->getMachineOpcode(),
                                 AMDGPU::OpName::dst) != -1) {
            // subtract one from ImmIdx, because the DST operand is usually index
            // 0 for MachineInstrs, but we have no DST in the Ops vector.
            ImmIdx--;
          }
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
          assert(C);

          if (C->getZExtValue() != 0) {
            // This instruction is already using an immediate.
            continue;
          }

          // Set the immediate value
          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
        }
      }
      // Set the immediate register
      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);

      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
    }
    break;
  }
  }
  SDNode *Result = SelectCode(N);

  // Fold operands of selected node

  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    const R600InstrInfo *TII =
        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
    if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) {
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
            I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

    }
    if (Result && Result->isMachineOpcode() &&
        !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
        && TII->hasInstrModifiers(Result->getMachineOpcode())) {
      // Fold FNEG/FABS
      // TODO: Isel can generate multiple MachineInst, we need to recursively
      // parse Result
      bool IsModified = false;
      do {
        std::vector<SDValue> Ops;
        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
            I != E; ++I)
          Ops.push_back(*I);
        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
        if (IsModified) {
          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
        }
      } while (IsModified);

      // If node has a single use which is CLAMP_R600, folds it
      if (Result->hasOneUse() && Result->isMachineOpcode()) {
        SDNode *PotentialClamp = *Result->use_begin();
        if (PotentialClamp->isMachineOpcode() &&
            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
          unsigned ClampIdx =
            TII->getOperandIdx(Result->getMachineOpcode(), AMDGPU::OpName::clamp);
          std::vector<SDValue> Ops;
          unsigned NumOp = Result->getNumOperands();
          for (unsigned i = 0; i < NumOp; ++i) {
            Ops.push_back(Result->getOperand(i));
          }
          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
          Result = CurDAG->SelectNodeTo(PotentialClamp,
              Result->getMachineOpcode(), PotentialClamp->getVTList(),
              Ops.data(), NumOp);
        }
      }
    }
  }

  return Result;
}