bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //    1. Store and load of 4-element vectors of 64 bits on AVX.
  //    2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //    1. Load of 16/32-element vecotrs of 8 bits on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support shuffle represents stride 4 for byte type with size of
  // WideInstSize.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
     return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 512 || WideInstSize == 1024))
     return true;

  if (ShuffleElemSize == 8 && isa<LoadInst>(Inst) && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768))
     return true;

  return false;
}
unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
                                           unsigned Alignment,
                                           unsigned AddressSpace) {
  VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);

  unsigned NumElem = SrcVTy->getVectorNumElements();
  VectorType *MaskTy =
    VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem);
  if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) ||
      (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) ||
      !isPowerOf2_32(NumElem)) {
    // Scalarization
    unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
    unsigned ScalarCompareCost =
      getCmpSelInstrCost(Instruction::ICmp,
                         Type::getInt8Ty(getGlobalContext()), NULL);
    unsigned BranchCost = getCFInstrCost(Instruction::Br);
    unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);

    unsigned ValueSplitCost =
      getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load,
                               Opcode == Instruction::Store);
    unsigned MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                         Alignment, AddressSpace);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(SrcVTy);
  unsigned Cost = 0;
  if (LT.second != TLI->getValueType(SrcVTy).getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires expand/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) +
            getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0);

  else if (LT.second.getVectorNumElements() > NumElem) {
    VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
                                            LT.second.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
  }
  if (!ST->hasAVX512())
    return Cost + LT.first*4; // Each maskmov costs 4

  // AVX-512 masked load/store is cheapper
  return Cost+LT.first;
}
// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  VectorType *ShuffleTy = Shuffles[0]->getType();

  if (isa<LoadInst>(Inst)) {
    // Try to generate target-sized register(/instruction).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    Type *ShuffleEltTy = Inst->getType();
    unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
      transpose_4x4(DecomposedVectors, TransposedVectors);
      break;
    case 8:
    case 16:
    case 32:
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);
      break;
    }

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
  unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //   vectors.
  decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
    interleave8bitStride4(DecomposedVectors, TransposedVectors, NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for wide-vec.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
                             SI->getAlignment());

  return true;
}
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&*EntryBlock.begin());

  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);

  unsigned MaxAlign;
  // FIXME: Alignment is broken broken with explicit arg offset.;
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");

  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
    Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
  uint64_t ExplicitArgOffset = 0;

  for (Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned Align = DL.getABITypeAlignment(ArgTy);
    unsigned Size = DL.getTypeSizeInBits(ArgTy);
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;

    if (Arg.use_empty())
      continue;

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    VectorType *VT = dyn_cast<VectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
                                      KernArgBaseAlign);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
                                   ArgPtr->getName() + ".cast");
    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable,
          MDNode::get(Ctx,
                      MDB.createConstant(
                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable_or_null,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          DerefOrNullBytes))));
      }

      unsigned ParamAlign = Arg.getParamAlignment();
      if (ParamAlign != 0) {
        Load->setMetadata(
          LLVMContext::MD_align,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          ParamAlign))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
                                                {0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addAttribute(
    AttributeList::ReturnIndex,
    Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}