static bool hasPrivateLoadStore(Loop *L) {
  const std::vector<Loop *> subLoops = L->getSubLoops();
  std::set<BasicBlock *> subBlocks, blocks;

  // Collect the blocks owned by subloops, then keep only the blocks that
  // belong directly to this loop.
  for (auto l : subLoops)
    for (auto bb : l->getBlocks())
      subBlocks.insert(bb);
  for (auto bb : L->getBlocks())
    if (subBlocks.find(bb) == subBlocks.end())
      blocks.insert(bb);

  // Report whether any load or store in those blocks touches private
  // (address space 0) memory.
  for (auto bb : blocks) {
    for (BasicBlock::iterator inst = bb->begin(), instE = bb->end();
         inst != instE; ++inst) {
      unsigned addrSpace = -1; // Sentinel: not a memory access.
      if (isa<LoadInst>(*inst)) {
        LoadInst *ld = cast<LoadInst>(&*inst);
        addrSpace = ld->getPointerAddressSpace();
      } else if (isa<StoreInst>(*inst)) {
        StoreInst *st = cast<StoreInst>(&*inst);
        addrSpace = st->getPointerAddressSpace();
      }
      if (addrSpace == 0)
        return true;
    }
  }
  return false;
}
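A minimal usage sketch (a hypothetical pass in the same translation unit, assuming LLVM's legacy LoopPass interface): a loop transformation can use the helper above to skip loops that touch private memory. The pass name is invented for illustration.

#include "llvm/Analysis/LoopPass.h"

namespace {
// Hypothetical example pass, not part of the original source.
struct SkipPrivateLoops : public llvm::LoopPass {
  static char ID;
  SkipPrivateLoops() : llvm::LoopPass(ID) {}

  bool runOnLoop(llvm::Loop *L, llvm::LPPassManager &) override {
    if (hasPrivateLoadStore(L))
      return false; // Loop touches private memory; leave it unchanged.
    // ... apply the transformation here ...
    return false;
  }
};
} // end anonymous namespace
char SkipPrivateLoops::ID = 0;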
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(
                ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    // Truncate the widened i32 back to the original width, then bitcast to
    // the original type.
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
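The range-metadata rewrite is the subtle step of the widening. Here is a minimal standalone sketch of just that step, assuming LLVM's IR headers; widenRangeTo32 is a hypothetical helper name. The zero upper bound produces the wrapped range [Lower, 2^32), i.e. the lower bound is preserved while nothing is assumed about the high bits, matching the code above.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Hypothetical helper: widen a sub-32-bit !range lower bound to i32.
static MDNode *widenRangeTo32(LLVMContext &Ctx, const APInt &Lower) {
  Type *I32Ty = Type::getInt32Ty(Ctx);
  Metadata *LowAndHigh[] = {
      ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower.zext(32))),
      ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};
  return MDNode::get(Ctx, LowAndHigh);
}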
void PropagateJuliaAddrspaces::visitLoadInst(LoadInst &LI) {
  unsigned AS = LI.getPointerAddressSpace();
  if (!isSpecialAS(AS))
    return;
  Value *Replacement = LiftPointer(LI.getPointerOperand(), LI.getType(), &LI);
  if (!Replacement)
    return;
  LI.setOperand(LoadInst::getPointerOperandIndex(), Replacement);
}
void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecTy = VecInst->getType();
  (void)VecTy;
  assert(VecTy->isVectorTy() &&
         DL.getTypeSizeInBits(VecTy) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Builder, Indices[i],
                                   SubVecTy->getVectorNumElements(), 0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
  Value *VecBasePtr;
  unsigned NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements load the information
  // in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  if (DL.getTypeSizeInBits(VecTy) == 768) {
    Type *VecTran =
        VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
    NumLoads = NumSubVectors * 2;
  } else
    VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);

  // Generate N loads of T type.
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
    DecomposedVectors.push_back(NewLoad);
  }
}
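As a side note, the load-count rule in the 768-bit special case can be stated in isolation (a hypothetical helper, not part of the original):

// Hypothetical helper mirroring the logic above: a 768-bit group is split
// into twice as many 128-bit loads instead of NumSubVectors wider ones.
static unsigned numDecomposedLoads(unsigned VecSizeInBits,
                                   unsigned NumSubVectors) {
  return VecSizeInBits == 768 ? NumSubVectors * 2 : NumSubVectors;
}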
PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
  auto &DL = LI.getModule()->getDataLayout();

  PointerOffsetPair POP;
  POP.Pointer = LI.getPointerOperand();
  unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace());
  POP.Offset = APInt(BitWidth, 0);

  // Strip bitcasts and constant-offset GEPs, accumulating the byte offset.
  while (isa<BitCastInst>(POP.Pointer) ||
         isa<GetElementPtrInst>(POP.Pointer)) {
    if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
      APInt LastOffset = POP.Offset;
      if (!GEP->accumulateConstantOffset(DL, POP.Offset)) {
        // Can't handle GEPs with variable indices.
        POP.Offset = LastOffset;
        return POP;
      }
      POP.Pointer = GEP->getPointerOperand();
    } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) {
      POP.Pointer = BC->getOperand(0);
    }
  }
  return POP;
}
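The stripping loop generalizes to any pointer value, not just a load's operand. A minimal sketch under that assumption (stripToBase is a hypothetical name):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: peel bitcasts and constant-offset GEPs off Ptr,
// accumulating the byte offset, and stop at the first variable-index GEP.
static Value *stripToBase(Value *Ptr, const DataLayout &DL, APInt &Offset) {
  for (;;) {
    if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
      APInt Saved = Offset;
      if (!GEP->accumulateConstantOffset(DL, Offset)) {
        Offset = Saved; // Variable index: keep the GEP itself as the base.
        return Ptr;
      }
      Ptr = GEP->getPointerOperand();
    } else if (auto *BC = dyn_cast<BitCastInst>(Ptr)) {
      Ptr = BC->getOperand(0);
    } else {
      return Ptr;
    }
  }
}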
unsigned CostModelAnalysis::getInstructionCost(Instruction *I) const {
  if (!VTTI)
    return -1;

  switch (I->getOpcode()) {
  case Instruction::Ret:
  case Instruction::PHI:
  case Instruction::Br: {
    return VTTI->getCFInstrCost(I->getOpcode());
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return VTTI->getArithmeticInstrCost(I->getOpcode(), I->getType());
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    Type *CondTy = SI->getCondition()->getType();
    return VTTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    return VTTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(I);
    Type *ValTy = SI->getValueOperand()->getType();
    return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, SI->getAlignment(),
                                 SI->getPointerAddressSpace());
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(I);
    return VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
                                 LI->getAlignment(),
                                 LI->getPointerAddressSpace());
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    Type *SrcTy = I->getOperand(0)->getType();
    return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
  }
  case Instruction::ExtractElement: {
    ExtractElementInst *EEI = cast<ExtractElementInst>(I);
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    unsigned Idx = -1; // Unknown lane unless the index is a constant.
    if (CI)
      Idx = CI->getZExtValue();
    return VTTI->getVectorInstrCost(I->getOpcode(),
                                    EEI->getOperand(0)->getType(), Idx);
  }
  case Instruction::InsertElement: {
    InsertElementInst *IE = cast<InsertElementInst>(I);
    ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    unsigned Idx = -1; // Unknown lane unless the index is a constant.
    if (CI)
      Idx = CI->getZExtValue();
    return VTTI->getVectorInstrCost(I->getOpcode(), IE->getType(), Idx);
  }
  default:
    // We don't have any information on this instruction.
    return -1;
  }
}
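A minimal driver sketch (hypothetical, assuming a CostModelAnalysis instance is at hand): the unsigned -1 returned for unknown instructions must be filtered out before aggregating costs.

using namespace llvm;

// Hypothetical driver: sum the known per-instruction costs of a function.
static unsigned estimateFunctionCost(Function &F,
                                     const CostModelAnalysis &CM) {
  unsigned Total = 0;
  for (BasicBlock &BB : F)
    for (Instruction &I : BB) {
      unsigned Cost = CM.getInstructionCost(&I);
      if (Cost != (unsigned)-1) // Skip instructions with no cost info.
        Total += Cost;
    }
  return Total;
}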
LVILatticeVal LVIQuery::getBlockValue(BasicBlock *BB) {
  // See if we already have a value for this block.
  LVILatticeVal BBLV = getCachedEntryForBlock(BB);

  // If we've already computed this block's value, return it.
  if (!BBLV.isUndefined()) {
    DEBUG(dbgs() << " reuse BB '" << BB->getName() << "' val=" << BBLV
                 << '\n');
    return BBLV;
  }

  // Otherwise, this is the first time we're seeing this block. Reset the
  // lattice value to overdefined, so that cycles will terminate and be
  // conservatively correct.
  BBLV.markOverdefined();
  Cache[BB] = BBLV;

  Instruction *BBI = dyn_cast<Instruction>(Val);
  if (BBI == 0 || BBI->getParent() != BB) {
    LVILatticeVal Result; // Start Undefined.

    // If this is a pointer, and there's a load from that pointer in this BB,
    // then we know that the pointer can't be NULL.
    bool NotNull = false;
    if (Val->getType()->isPointerTy()) {
      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
           ++BI) {
        LoadInst *L = dyn_cast<LoadInst>(BI);
        if (L && L->getPointerAddressSpace() == 0 &&
            L->getPointerOperand()->getUnderlyingObject() ==
                Val->getUnderlyingObject()) {
          NotNull = true;
          break;
        }
      }
    }

    unsigned NumPreds = 0;
    // Loop over all of our predecessors, merging what we know from them into
    // result.
    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
      Result.mergeIn(getEdgeValue(*PI, BB));

      // If we hit overdefined, exit early. The BlockVals entry is already
      // set to overdefined.
      if (Result.isOverdefined()) {
        DEBUG(dbgs() << " compute BB '" << BB->getName()
                     << "' - overdefined because of pred.\n");
        // If we previously determined that this is a pointer that can't be
        // null then return that rather than giving up entirely.
        if (NotNull) {
          const PointerType *PTy = cast<PointerType>(Val->getType());
          Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
        }
        return Result;
      }
      ++NumPreds;
    }

    // If this is the entry block, we must be asking about an argument. The
    // value is overdefined.
    if (NumPreds == 0 && BB == &BB->getParent()->front()) {
      assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
      Result.markOverdefined();
      return Result;
    }

    // Return the merged value, which is more precise than 'overdefined'.
    assert(!Result.isOverdefined());
    return Cache[BB] = Result;
  }

  // If this value is defined by an instruction in this block, we have to
  // process it here somehow or return overdefined.
  if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
    LVILatticeVal Result; // Start Undefined.

    // Loop over all of our predecessors, merging what we know from them into
    // result.
    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
      Value *PhiVal = PN->getIncomingValueForBlock(*PI);
      Result.mergeIn(Parent.getValueOnEdge(PhiVal, *PI, BB));

      // If we hit overdefined, exit early. The BlockVals entry is already
      // set to overdefined.
      if (Result.isOverdefined()) {
        DEBUG(dbgs() << " compute BB '" << BB->getName()
                     << "' - overdefined because of pred.\n");
        return Result;
      }
    }

    // Return the merged value, which is more precise than 'overdefined'.
    assert(!Result.isOverdefined());
    return Cache[BB] = Result;
  }

  assert(Cache[BB].isOverdefined() && "Recursive query changed our cache?");

  // We can only analyze the definitions of certain classes of instructions
  // (integral binops and casts at the moment), so bail if this isn't one.
  LVILatticeVal Result;
  if ((!isa<BinaryOperator>(BBI) && !isa<CastInst>(BBI)) ||
      !BBI->getType()->isIntegerTy()) {
    DEBUG(dbgs() << " compute BB '" << BB->getName()
                 << "' - overdefined because inst def found.\n");
    Result.markOverdefined();
    return Result;
  }

  // FIXME: We're currently limited to binops with a constant RHS. This
  // should be improved.
  BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
  if (BO && !isa<ConstantInt>(BO->getOperand(1))) {
    DEBUG(dbgs() << " compute BB '" << BB->getName()
                 << "' - overdefined because inst def found.\n");
    Result.markOverdefined();
    return Result;
  }

  // Figure out the range of the LHS. If that fails, bail.
  LVILatticeVal LHSVal = Parent.getValueInBlock(BBI->getOperand(0), BB);
  if (!LHSVal.isConstantRange()) {
    Result.markOverdefined();
    return Result;
  }

  ConstantInt *RHS = 0;
  ConstantRange LHSRange = LHSVal.getConstantRange();
  ConstantRange RHSRange(1);
  const IntegerType *ResultTy = cast<IntegerType>(BBI->getType());
  if (isa<BinaryOperator>(BBI)) {
    RHS = dyn_cast<ConstantInt>(BBI->getOperand(1));
    if (!RHS) {
      Result.markOverdefined();
      return Result;
    }
    // Encode the constant RHS as the singleton range [RHS, RHS+1).
    RHSRange = ConstantRange(RHS->getValue(), RHS->getValue() + 1);
  }

  // NOTE: We're currently limited by the set of operations that
  // ConstantRange can evaluate symbolically. Enhancing that set will allow
  // us to analyze more definitions.
  switch (BBI->getOpcode()) {
  case Instruction::Add:
    Result.markConstantRange(LHSRange.add(RHSRange));
    break;
  case Instruction::Sub:
    Result.markConstantRange(LHSRange.sub(RHSRange));
    break;
  case Instruction::Mul:
    Result.markConstantRange(LHSRange.multiply(RHSRange));
    break;
  case Instruction::UDiv:
    Result.markConstantRange(LHSRange.udiv(RHSRange));
    break;
  case Instruction::Shl:
    Result.markConstantRange(LHSRange.shl(RHSRange));
    break;
  case Instruction::LShr:
    Result.markConstantRange(LHSRange.lshr(RHSRange));
    break;
  case Instruction::Trunc:
    Result.markConstantRange(LHSRange.truncate(ResultTy->getBitWidth()));
    break;
  case Instruction::SExt:
    Result.markConstantRange(LHSRange.signExtend(ResultTy->getBitWidth()));
    break;
  case Instruction::ZExt:
    Result.markConstantRange(LHSRange.zeroExtend(ResultTy->getBitWidth()));
    break;
  case Instruction::BitCast:
    Result.markConstantRange(LHSRange);
    break;
  case Instruction::And:
    Result.markConstantRange(LHSRange.binaryAnd(RHSRange));
    break;
  case Instruction::Or:
    Result.markConstantRange(LHSRange.binaryOr(RHSRange));
    break;

  // Unhandled instructions are overdefined.
  default:
    DEBUG(dbgs() << " compute BB '" << BB->getName()
                 << "' - overdefined because inst def found.\n");
    Result.markOverdefined();
    break;
  }

  return Cache[BB] = Result;
}
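The switch above delegates all interval arithmetic to ConstantRange. A minimal standalone demonstration of that arithmetic, assuming a recent LLVM tree where ConstantRange.h lives under llvm/IR: ranges are half-open, and a constant c is encoded as the singleton [c, c+1), exactly as done for RHS above.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using namespace llvm;

void constantRangeDemo() {
  ConstantRange LHS(APInt(8, 10), APInt(8, 20)); // The values 10..19.
  ConstantRange RHS(APInt(8, 3), APInt(8, 4));   // The constant 3.
  ConstantRange Sum = LHS.add(RHS);              // [13, 23): values 13..22.
  ConstantRange Wide = Sum.zeroExtend(16);       // Same values, now 16 bits.
  (void)Wide;
}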
int qdp_jit_vec::vectorize_loads( std::vector<std::vector<Instruction*> >& load_instructions )
{
  DEBUG(dbgs() << "Vectorize loads, total of " << load_instructions.size() << "\n");

  scalar_vector_pairs.clear();

  if (load_instructions.empty())
    return 0;

  int load_vec_elem = 0;
  for( std::vector<Instruction*>& VI : load_instructions ) {
    DEBUG(dbgs() << "Processing vector of loads number " << load_vec_elem++ << "\n");
    assert( VI.size() == vec_len && "length of vector of loads does not match vec_len" );

    // Check that the loads read consecutive elements: each load's GEP index
    // must be one past the previous load's index.
    bool loads_consec = true;
    uint64_t lo,hi;
    bool first = true;
    for( Instruction* I : VI ) {
      GetElementPtrInst* GEP;
      if ((GEP = dyn_cast<GetElementPtrInst>(I->getOperand(0)))) {
        if (first) {
          ConstantInt* CI;
          if ((CI = dyn_cast<ConstantInt>(GEP->getOperand(1)))) {
            lo = CI->getZExtValue();
            hi = lo+1;
            first = false;
          } else {
            DEBUG(dbgs() << "First load in the chain: Operand of GEP not a ConstantInt" << *GEP->getOperand(1) << "\n");
            assert( 0 && "First load in the chain: Operand of GEP not a ConstantInt\n");
            exit(0);
          }
        } else {
          ConstantInt* CI;
          if ((CI = dyn_cast<ConstantInt>(GEP->getOperand(1)))) {
            if (hi != CI->getZExtValue()) {
              DEBUG(dbgs() << "Loads not consecutive lo=" << lo << " hi=" << hi << " this=" << CI->getZExtValue() << "\n");
              loads_consec = false;
            } else {
              hi++;
            }
          }
        }
      } else {
        DEBUG(dbgs() << "Operand of load not a GEP " << *I->getOperand(0) << "\n");
        assert( 0 && "Operand of load not a GEP" );
        exit(0);
      }
    }

    if (loads_consec) {
      DEBUG(dbgs() << "Loads consecutive\n");

      // Consecutive loads: replace them with a single vector load through a
      // bitcast of the first element's address.
      LoadInst* LI = cast<LoadInst>(VI.at(0));
      GetElementPtrInst* GEP = cast<GetElementPtrInst>(LI->getOperand(0));
      Instruction* GEPcl = clone_with_operands(GEP);
      unsigned AS = LI->getPointerAddressSpace();
      VectorType *VecTy = VectorType::get( LI->getType() , vec_len );

      unsigned bitwidth = LI->getType()->getPrimitiveSizeInBits();
      unsigned bytewidth = bitwidth == 1 ? 1 : bitwidth/8;
      DEBUG(dbgs() << "bit/byte width of load instr type: " << bitwidth << "/" << bytewidth << "\n");

      Value *VecPtr = Builder->CreateBitCast(GEPcl,VecTy->getPointerTo(AS));
      // The vector load is naturally aligned only when the first scalar
      // index is a multiple of the vector length.
      unsigned align = lo % vec_len == 0 ? bytewidth * vec_len : bytewidth;
      Value *VecLoad = Builder->CreateAlignedLoad( VecPtr , align );

      scalar_vector_pairs.push_back( std::make_pair( VI.at(0) , VecLoad ) );
    } else {
      DEBUG(dbgs() << "Loads not consecutive:\n");
      for (Value* V: VI) {
        DEBUG(dbgs() << *V << "\n");
      }

      // Non-consecutive loads: clone them and gather the results into a
      // vector with insertelement instructions.
      std::vector<Instruction*> VIcl;
      for( Instruction* I : VI ) {
        VIcl.push_back( clone_with_operands(I) );
      }

      VectorType *VecTy = VectorType::get( VI.at(0)->getType() , vec_len );
      Value *Vec = UndefValue::get(VecTy);

      int i=0;
      for( Instruction* I : VIcl ) {
        Vec = Builder->CreateInsertElement(Vec, I, Builder->getInt32(i++));
      }

      scalar_vector_pairs.push_back( std::make_pair( VI.at(0) , Vec ) );
    }
  }

  DEBUG(dbgs() << "Searching for the stores:\n");

  //
  // Vectorize all StoreInst reachable by the first load of each vector of loads
  //
  SetVector<Instruction*> stores_processed;
  {
    SetVector<Instruction*> to_visit;
    for( std::vector<Instruction*>& VI : load_instructions ) {
      to_visit.insert(VI.at(0));
    }
    while (!to_visit.empty()) {
      Instruction* I = to_visit.back();
      to_visit.pop_back();
      DEBUG(dbgs() << "visiting " << *I << "\n");
      if (StoreInst* SI = dyn_cast<StoreInst>(I)) {
        if (!stores_processed.count(SI)) {
          get_vector_version( SI );
          stores_processed.insert( SI );
        }
      } else {
        for (Use &U : I->uses()) {
          Value* V = U.getUser();
          to_visit.insert(cast<Instruction>(V));
        }
      }
    }
  }

  //
  // Mark all stores as being processed
  //
  SetVector<Instruction*> to_visit;
  for( std::vector<Instruction*>& VI : load_instructions ) {
    for( Instruction* I : VI ) {
      to_visit.insert(I);
      if (GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(I->getOperand(0))) {
        for_erasure.insert(GEP);
      }
    }
  }
  while (!to_visit.empty()) {
    Instruction* I = to_visit.back();
    to_visit.pop_back();
    for_erasure.insert(I);
    if (StoreInst* SI = dyn_cast<StoreInst>(I)) {
      stores_processed.insert(SI);
      if (GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(SI->getOperand(1))) {
        for_erasure.insert(GEP);
      }
    } else {
      for (Use &U : I->uses()) {
        Value* V = U.getUser();
        to_visit.insert(cast<Instruction>(V));
      }
    }
  }

  DEBUG(dbgs() << "----------------------------------------\n");
  DEBUG(dbgs() << "After vectorize_loads\n");

  return 0;
}
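The alignment choice in the consecutive-loads branch follows a simple rule, isolated here for clarity (a hypothetical helper, not part of the original):

// Hypothetical helper: the combined vector load is naturally aligned only
// when the first scalar index is a multiple of the vector length; otherwise
// only element alignment can be assumed.
static unsigned vectorLoadAlign(uint64_t FirstIdx, unsigned VecLen,
                                unsigned EltBytes) {
  return FirstIdx % VecLen == 0 ? EltBytes * VecLen : EltBytes;
}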