static std::string getOCLTypeName(Type *Ty, bool Signed) {
  switch (Ty->getTypeID()) {
  case Type::HalfTyID:
    return "half";
  case Type::FloatTyID:
    return "float";
  case Type::DoubleTyID:
    return "double";
  case Type::IntegerTyID: {
    if (!Signed)
      return (Twine('u') + getOCLTypeName(Ty, true)).str();
    unsigned BW = Ty->getIntegerBitWidth();
    switch (BW) {
    case 8:
      return "char";
    case 16:
      return "short";
    case 32:
      return "int";
    case 64:
      return "long";
    default:
      return (Twine('i') + Twine(BW)).str();
    }
  }
  case Type::VectorTyID: {
    VectorType *VecTy = cast<VectorType>(Ty);
    Type *EleTy = VecTy->getElementType();
    unsigned Size = VecTy->getVectorNumElements();
    return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
  }
  default:
    return "unknown";
  }
}
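// Illustrative sketch (not part of the original source): expected results for
// a few inputs, assuming an LLVMContext `Ctx` is in scope. The unsigned case
// recurses with Signed=true and prefixes 'u'; vectors append their element
// count to the element type's name.
//
//   getOCLTypeName(Type::getInt32Ty(Ctx), /*Signed=*/true)    -> "int"
//   getOCLTypeName(Type::getInt32Ty(Ctx), /*Signed=*/false)   -> "uint"
//   getOCLTypeName(Type::getIntNTy(Ctx, 48), /*Signed=*/true) -> "i48"
//   getOCLTypeName(VectorType::get(Type::getFloatTy(Ctx), 4),
//                  /*Signed=*/true)                           -> "float4"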
unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // For a scalar, take the regular memory op cost, without a mask.
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
      VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization.
    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    unsigned ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
    unsigned BranchCost = getCFInstrCost(Instruction::Br);
    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    unsigned ValueSplitCost =
        getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
                                 Opcode == Instruction::Store);
    unsigned MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
  unsigned Cost = 0;
  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);
  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first * 4; // Each maskmov costs 4.

  // AVX-512 masked load/store is cheaper.
  return Cost + LT.first;
}
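// Worked example (a sketch, not from the original source): on a pre-AVX-512
// target where a masked load of <8 x i32> is illegal, the scalarization path
// applies. With NumElem = 8, the returned cost is
//   8 * scalar-load cost                                    (MemopCost)
//   + extract/insert overhead for the value and mask vectors
//   + 8 * (branch + i8 compare)                             (MaskCmpCost)
// A hypothetical query through the TargetTransformInfo wrapper might look
// like this (`TTI` and `Ctx` are assumed to be in scope):
//
//   VectorType *VTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
//   unsigned Cost = TTI.getMaskedMemoryOpCost(Instruction::Load, VTy,
//                                             /*Alignment=*/4,
//                                             /*AddressSpace=*/0);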
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor,
            VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    interleave8bitStride4(DecomposedVectors, TransposedVectors,
                          NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
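// Illustrative IR (a sketch, not from the original source) for the store path
// with Factor = 4 and NumSubVecElems = 4: the vectorizer concatenates the four
// sub-vectors %v0..%v3 and emits one wide interleaving shuffle feeding a
// store, which this function decomposes, transposes via transpose_4x4,
// reconcatenates, and stores:
//
//   %v0v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1,
//       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %v2v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3,
//       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %interleaved = shufflevector <8 x i32> %v0v1, <8 x i32> %v2v3,
//       <16 x i32> <i32 0, i32 4, i32 8,  i32 12, i32 1, i32 5, i32 9,  i32 13,
//                   i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
//   store <16 x i32> %interleaved, <16 x i32>* %ptr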
/// If we have insertion into a vector that is wider than the vector that we
/// are extracting from, try to widen the source vector to allow a single
/// shufflevector to replace one or more insert/extract pairs.
static void replaceExtractElements(InsertElementInst *InsElt,
                                   ExtractElementInst *ExtElt,
                                   InstCombiner &IC) {
  VectorType *InsVecType = InsElt->getType();
  VectorType *ExtVecType = ExtElt->getVectorOperandType();
  unsigned NumInsElts = InsVecType->getVectorNumElements();
  unsigned NumExtElts = ExtVecType->getVectorNumElements();

  // The inserted-to vector must be wider than the extracted-from vector.
  if (InsVecType->getElementType() != ExtVecType->getElementType() ||
      NumExtElts >= NumInsElts)
    return;

  // Create a shuffle mask to widen the extracted-from vector using undefined
  // values. The mask selects all of the values of the original vector followed
  // by as many undefined values as needed to create a vector of the same
  // length as the inserted-to vector.
  SmallVector<Constant *, 16> ExtendMask;
  IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());
  for (unsigned i = 0; i < NumExtElts; ++i)
    ExtendMask.push_back(ConstantInt::get(IntType, i));
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    ExtendMask.push_back(UndefValue::get(IntType));

  Value *ExtVecOp = ExtElt->getVectorOperand();
  auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
  BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
                                   ? ExtVecOpInst->getParent()
                                   : ExtElt->getParent();

  // TODO: This restriction matches the basic block check below when creating
  // new extractelement instructions. If that limitation is removed, this one
  // could also be removed. But for now, we just bail out to ensure that we
  // will replace the extractelement instruction that is feeding our
  // insertelement instruction. This allows the insertelement to then be
  // replaced by a shufflevector. If the insertelement is not replaced, we can
  // induce infinite looping because there's an optimization for extractelement
  // that will delete our widening shuffle. This would trigger another attempt
  // here to create that shuffle, and we spin forever.
  if (InsertionBlock != InsElt->getParent())
    return;

  auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
                                        ConstantVector::get(ExtendMask));

  // Insert the new shuffle after the vector operand of the extract is defined
  // (as long as it's not a PHI) or at the start of the basic block of the
  // extract, so any subsequent extracts in the same basic block can use it.
  // TODO: Insert before the earliest ExtractElementInst that is replaced.
  if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
    WideVec->insertAfter(ExtVecOpInst);
  else
    IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());

  // Replace extracts from the original narrow vector with extracts from the
  // new wide vector.
  for (User *U : ExtVecOp->users()) {
    ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
    if (!OldExt || OldExt->getParent() != WideVec->getParent())
      continue;
    auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
    NewExt->insertAfter(WideVec);
    IC.replaceInstUsesWith(*OldExt, NewExt);
  }
}
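// Before/after sketch (illustrative, not from the original source). Given a
// narrow source vector %src feeding extract/insert pairs into a wider vector:
//
//   %e0 = extractelement <2 x double> %src, i32 0
//   %i0 = insertelement <4 x double> undef, double %e0, i32 0
//
// the widening shuffle created above lets instcombine later fold the
// insert/extract chain into a single shufflevector:
//
//   %wide = shufflevector <2 x double> %src, <2 x double> undef,
//                         <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
//   %e0.w = extractelement <4 x double> %wide, i32 0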