Example #1
/// Return the OpenCL type name for \p Ty, e.g. "float", "uint" or "int4".
static std::string getOCLTypeName(Type *Ty, bool Signed) {
  switch (Ty->getTypeID()) {
  case Type::HalfTyID:
    return "half";
  case Type::FloatTyID:
    return "float";
  case Type::DoubleTyID:
    return "double";
  case Type::IntegerTyID: {
    if (!Signed)
      return (Twine('u') + getOCLTypeName(Ty, true)).str();
    unsigned BW = Ty->getIntegerBitWidth();
    switch (BW) {
    case 8:
      return "char";
    case 16:
      return "short";
    case 32:
      return "int";
    case 64:
      return "long";
    default:
      return (Twine('i') + Twine(BW)).str();
    }
  }
  case Type::VectorTyID: {
    VectorType *VecTy = cast<VectorType>(Ty);
    Type *EleTy = VecTy->getElementType();
    unsigned Size = VecTy->getVectorNumElements();
    return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
  }
  default:
    return "unknown";
  }
}
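For context, a minimal usage sketch (assuming the usual LLVM headers; the `Ctx` setup is hypothetical and the expected strings are simply read off the switch above, not output from a real run):

LLVMContext Ctx;
Type *I32 = Type::getInt32Ty(Ctx);
VectorType *V4I32 = VectorType::get(I32, 4);
// getOCLTypeName(I32, /*Signed=*/true)    -> "int"
// getOCLTypeName(I32, /*Signed=*/false)   -> "uint"  ('u' prepended to the
//                                            signed name by the recursion)
// getOCLTypeName(V4I32, /*Signed=*/true)  -> "int4"  (element name + lanes)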
Example #2
/// Return the cost of a masked memory operation on \p SrcTy. Scalar types and
/// illegal masked vector operations fall back to a scalarized cost estimate.
unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // For a scalar type, take the regular memory-op cost without a mask.
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
    VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization
    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    unsigned ScalarCompareCost =
      getCmpSelInstrCost(Instruction::ICmp,
                         Type::getInt8Ty(getGlobalContext()), nullptr);
    unsigned BranchCost = getCFInstrCost(Instruction::Br);
    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    unsigned ValueSplitCost =
      getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
                               Opcode == Instruction::Store);
    unsigned MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
  unsigned Cost = 0;
  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires an expand/truncate of the data and a shuffle of
    // the mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr);
  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first * 4; // Each maskmov costs 4.

  // AVX-512 masked load/store is cheaper.
  return Cost + LT.first;
}
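As a rough walkthrough of the non-AVX-512 tail of this function (the concrete type and legalization result below are assumptions for illustration, not measured costs):

// Assume a masked load of <8 x float> on an AVX2 target, where the type
// legalizes to a single v8f32:
//   LT   == {1, v8f32}           // LT.first == 1, no splitting
//   Cost == 0                    // no promotion/expansion shuffles needed
//   return Cost + LT.first * 4;  // == 4, one vmaskmov modeled at cost 4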
Example #3
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized registers/instructions.
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Note: despite the name, this is the wide vector type of the load
    // itself, not the type of a single element.
    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform a matrix transposition to compute the interleaved results
    // using (optimized) target-specific instructions.
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized interleaved vectors with the
    // transposed vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for the wide vector.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
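A hypothetical trace of the load path above (the concrete types are assumed for illustration):

// Inst   : load <16 x i32>   -- the wide interleaved load
// Factor : 4                 -- four interleaved sub-vectors
// NumSubVecElems == 16 / 4 == 4, so the switch picks transpose_4x4 and
// each original shuffle Shuffles[i] is replaced with the transposed
// vector selected by Indices[i].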
Example #4
/// If we have insertion into a vector that is wider than the vector that we
/// are extracting from, try to widen the source vector to allow a single
/// shufflevector to replace one or more insert/extract pairs.
static void replaceExtractElements(InsertElementInst *InsElt,
                                   ExtractElementInst *ExtElt,
                                   InstCombiner &IC) {
  VectorType *InsVecType = InsElt->getType();
  VectorType *ExtVecType = ExtElt->getVectorOperandType();
  unsigned NumInsElts = InsVecType->getVectorNumElements();
  unsigned NumExtElts = ExtVecType->getVectorNumElements();

  // The inserted-to vector must be wider than the extracted-from vector.
  if (InsVecType->getElementType() != ExtVecType->getElementType() ||
      NumExtElts >= NumInsElts)
    return;

  // Create a shuffle mask to widen the extracted-from vector using undefined
  // values. The mask selects all of the values of the original vector
  // followed by as many undefined values as needed to create a vector of the
  // same length as the inserted-to vector.
  SmallVector<Constant *, 16> ExtendMask;
  IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());
  for (unsigned i = 0; i < NumExtElts; ++i)
    ExtendMask.push_back(ConstantInt::get(IntType, i));
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    ExtendMask.push_back(UndefValue::get(IntType));

  Value *ExtVecOp = ExtElt->getVectorOperand();
  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
                                   ? ExtVecOpInst->getParent()
                                   : ExtElt->getParent();

  // TODO: This restriction matches the basic block check below when creating
  // new extractelement instructions. If that limitation is removed, this one
  // could also be removed. But for now, we just bail out to ensure that we
  // will replace the extractelement instruction that is feeding our
  // insertelement instruction. This allows the insertelement to then be
  // replaced by a shufflevector. If the insertelement is not replaced, we can
  // induce infinite looping because there's an optimization for extractelement
  // that will delete our widening shuffle. This would trigger another attempt
  // here to create that shuffle, and we spin forever.
  if (InsertionBlock != InsElt->getParent())
    return;

  auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
                                        ConstantVector::get(ExtendMask));

  // Insert the new shuffle after the vector operand of the extract is defined
  // (as long as it's not a PHI) or at the start of the basic block of the
  // extract, so any subsequent extracts in the same basic block can use it.
  // TODO: Insert before the earliest ExtractElementInst that is replaced.
  if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
    WideVec->insertAfter(ExtVecOpInst);
  else
    IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());

  // Replace extracts from the original narrow vector with extracts from the new
  // wide vector.
  for (User *U : ExtVecOp->users()) {
    auto *OldExt = dyn_cast<ExtractElementInst>(U);
    if (!OldExt || OldExt->getParent() != WideVec->getParent())
      continue;
    auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
    NewExt->insertAfter(WideVec);
    IC.replaceInstUsesWith(*OldExt, NewExt);
  }
}
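To make the mask construction concrete, a hypothetical instance with NumExtElts == 2 and NumInsElts == 4:

// ExtendMask ends up as <i32 0, i32 1, i32 undef, i32 undef>, so the
// widening shuffle is, in IR terms:
//   %wide = shufflevector <2 x T> %src, <2 x T> undef,
//                         <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
// It keeps both original lanes and pads with undef; the loop above then
// redirects same-block extracts to read from %wide.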