bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vectors of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst))
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // The checks below accept only the supported combinations listed above:
  // element size, wide instruction size, and stride factor must all match.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 512 || WideInstSize == 1024))
    return true;

  if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768))
    return true;

  return false;
}
unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // For a scalar, take the regular memory op cost (no mask is needed).
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
      VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);

  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization
    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    unsigned ScalarCompareCost = getCmpSelInstrCost(
        Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr);
    unsigned BranchCost = getCFInstrCost(Instruction::Br);
    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    unsigned ValueSplitCost = getScalarizationOverhead(
        SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store);
    unsigned MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
  unsigned Cost = 0;
  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);
  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires filling the mask with zeroes.
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }

  if (!ST->hasAVX512())
    return Cost + LT.first * 4; // Each maskmov costs 4.

  // AVX-512 masked load/store is cheaper.
  return Cost + LT.first;
}
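For intuition, here is a minimal standalone sketch of how the scalarization fallback above adds up its pieces (mask extraction, per-element compare and branch, value extraction, and per-element memory ops). The element count and unit costs are hypothetical placeholders, not values taken from any X86 cost table.

// Illustrative only: a standalone sketch of the scalarization fallback cost.
// All unit costs below are assumed placeholders, not real X86 cost numbers.
#include <cstdio>

int main() {
  const unsigned NumElem = 8;           // hypothetical <8 x i32> masked store
  const unsigned MaskSplitCost = 8;     // extract each mask element (assumed)
  const unsigned ScalarCompareCost = 1; // icmp per element (assumed)
  const unsigned BranchCost = 1;        // branch per element (assumed)
  const unsigned ValueSplitCost = 8;    // extract each data element (assumed)
  const unsigned ScalarMemopCost = 1;   // scalar store per element (assumed)

  unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
  unsigned MemopCost = NumElem * ScalarMemopCost;
  unsigned Total = MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  std::printf("scalarized masked-op cost estimate: %u\n", Total);
  return 0;
}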
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    interleave8bitStride4(DecomposedVectors, TransposedVectors,
                          NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
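For intuition, the sketch below illustrates with plain arrays what the transpose step above achieves for Factor == 4: transposing the four decomposed sub-vectors yields the element order of the interleaved wide vector (and the inverse transposition de-interleaves a wide load). It is a conceptual illustration only, not the target-specific shuffle sequence that transpose_4x4 or interleave8bitStride4 actually emit.

// Illustrative only: transpose of a 4x4 element matrix as a model of
// stride-4 interleaving. Rows of In are the four decomposed sub-vectors.
#include <cstdio>

int main() {
  int In[4][4] = {{ 0,  1,  2,  3},
                  { 4,  5,  6,  7},
                  { 8,  9, 10, 11},
                  {12, 13, 14, 15}};
  int Out[4][4];
  for (int Row = 0; Row < 4; ++Row)
    for (int Col = 0; Col < 4; ++Col)
      Out[Col][Row] = In[Row][Col]; // transpose

  // Concatenating Out row by row gives 0,4,8,12, 1,5,9,13, ... which is the
  // element order of the interleaved wide vector when Factor == 4.
  for (int Row = 0; Row < 4; ++Row)
    for (int Col = 0; Col < 4; ++Col)
      std::printf("%d ", Out[Row][Col]);
  std::printf("\n");
  return 0;
}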
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&*EntryBlock.begin());

  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);

  unsigned MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");

  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
  KernArgSegment->addAttribute(
      AttributeList::ReturnIndex,
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
  uint64_t ExplicitArgOffset = 0;

  for (Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned Align = DL.getABITypeAlignment(ArgTy);
    unsigned Size = DL.getTypeSizeInBits(ArgTy);
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;

    if (Arg.use_empty())
      continue;

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    VectorType *VT = dyn_cast<VectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
                                      KernArgBaseAlign);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
                                   ArgPtr->getName() + ".cast");
    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(
                            ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            DerefOrNullBytes))));
      }

      unsigned ParamAlign = Arg.getParamAlignment();
      if (ParamAlign != 0) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                            ParamAlign))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
          Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
                                                {0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addAttribute(
      AttributeList::ReturnIndex,
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}
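For intuition, the standalone sketch below reproduces the DoShiftOpt arithmetic on host integers: a sub-dword argument at EltOffset is read by loading the dword at alignDown(EltOffset, 4) and recovering the value with a logical shift right by OffsetDiff * 8 followed by a truncate. The kernarg buffer contents and offsets are made up for illustration, and little-endian layout is assumed.

// Illustrative only: the shift/truncate extraction behind the DoShiftOpt path,
// modeled with a host-side byte buffer instead of the kernarg segment.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Hypothetical kernarg segment: an i16 argument stored at byte offset 6.
  uint8_t KernArg[8] = {0};
  uint16_t ArgValue = 0xBEEF;
  uint64_t EltOffset = 6;
  std::memcpy(KernArg + EltOffset, &ArgValue, sizeof(ArgValue));

  uint64_t AlignDownOffset = EltOffset & ~uint64_t(3); // alignDown(EltOffset, 4)
  uint64_t OffsetDiff = EltOffset - AlignDownOffset;   // 2 bytes into the dword

  uint32_t Dword;
  std::memcpy(&Dword, KernArg + AlignDownOffset, 4);         // widened i32 load
  uint16_t Extracted = uint16_t(Dword >> (OffsetDiff * 8));  // lshr + trunc

  std::printf("extracted 0x%X (expected 0x%X)\n", Extracted, ArgValue);
  return 0;
}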