bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
  SmallVector<Instruction *, 4> SplitPoints;
  bool changed = false;

  // Collect all the split points in SplitPoints.
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    BasicBlock::iterator IB = BI->begin();
    BasicBlock::iterator II = IB;
    BasicBlock::iterator IE = BI->end();

    // Skip the first instruction. No splitting is needed at this
    // point even if this is a bar.
    while (II != IE) {
      if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
        Intrinsic::ID id = inst->getIntrinsicID();
        // If this is a barrier, split at this instruction
        // and the next instruction.
        if (llvm::isBarrierIntrinsic(id)) {
          if (II != IB)
            SplitPoints.push_back(II);
          II++;
          if ((II != IE) && (!II->isTerminator())) {
            SplitPoints.push_back(II);
            II++;
          }
          continue;
        }
      }
      II++;
    }
  }

  for (unsigned i = 0; i != SplitPoints.size(); i++) {
    changed = true;
    Instruction *inst = SplitPoints[i];
    inst->getParent()->splitBasicBlock(inst, "bar_split");
  }

  return changed;
}
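// A minimal before/after sketch (not from the pass itself) of the splitting
// performed above. The intrinsic name and block labels are illustrative;
// splitBasicBlock splits *before* each recorded instruction, so every barrier
// starts its own block and the instruction after it starts another:
//
//   Before:                                 After:
//     entry:                                  entry:
//       %v = load i32* %p                       %v = load i32* %p
//       call void @llvm.nvvm.barrier0()         br label %bar_split
//       store i32 %v, i32* %q                 bar_split:
//       ret void                                call void @llvm.nvvm.barrier0()
//                                               br label %bar_split1
//                                             bar_split1:
//                                               store i32 %v, i32* %q
//                                               ret void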
// Reroll the provided loop with respect to the provided induction variable.
// Generally, we're looking for a loop like this:
//
// %iv = phi [ (preheader, ...), (body, %iv.next) ]
// f(%iv)
// %iv.1 = add %iv, 1 <-- a root increment
// f(%iv.1)
// %iv.2 = add %iv, 2 <-- a root increment
// f(%iv.2)
// ...
// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
// f(%iv.scale_m_1)
// %iv.next = add %iv, scale
// %cmp = icmp(%iv, ...)
// br %cmp, header, exit
//
// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
// be intermixed with each other. The restriction imposed by this algorithm is
// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
// etc. be the same.
//
// First, we collect the use set of %iv, excluding the other increment roots.
// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
// times, having collected the use set of f(%iv.(i+1)), during which we:
//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
//     the next unmatched instruction in f(%iv.(i+1)).
//   - Ensure that both matched instructions don't have any external users
//     (with the exception of last-in-chain reduction instructions).
//   - Track the (aliasing) write set, and other side effects, of all
//     instructions that belong to future iterations that come before the
//     matched instructions. If the matched instructions read from that write
//     set, then f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
//     if any of these future instructions had side effects (could not be
//     speculatively executed), and so do the matched instructions, then we
//     cannot reorder those side-effect-producing instructions, and rerolling
//     fails.
//
// Finally, we make sure that all loop instructions are either loop-increment
// roots, part of simple latch code, part of validated reductions, part of
// f(%iv), or part of some f(%iv.i). If all of that is true (and all reductions
// have been validated), then we reroll the loop.
bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
                        const SCEV *IterCount,
                        ReductionTracker &Reductions) {
  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
  uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
                   getValue()->getZExtValue();
  // The collection of loop increment instructions.
  SmallInstructionVector LoopIncs;
  uint64_t Scale = Inc;

  // The effective induction variable, IV, is normally also the real induction
  // variable. When we're dealing with a loop like:
  //   for (int i = 0; i < 500; ++i)
  //     x[3*i] = ...;
  //     x[3*i+1] = ...;
  //     x[3*i+2] = ...;
  // then the real IV is still i, but the effective IV is (3*i).
  Instruction *RealIV = IV;
  if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
    return false;

  assert(Scale <= MaxInc && "Scale is too large");
  assert(Scale > 1 && "Scale must be at least 2");

  // The set of increment instructions for each increment value.
  SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
  SmallInstructionSet AllRoots;
  if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
    return false;

  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
                  *RealIV << "\n");

  // An array of just the possible reductions for this scale factor. When we
  // collect the set of all users of some root instructions, these reduction
  // instructions are treated as 'final' (their uses are not considered).
  // This is important because we don't want the root use set to search down
  // the reduction chain.
  SmallInstructionSet PossibleRedSet;
  SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
  Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
                             PossibleRedLastSet);

  // We now need to check for equivalence of the use graph of each root with
  // that of the primary induction variable (excluding the roots). Our goal
  // here is not to solve the full graph isomorphism problem, but rather to
  // catch common cases without a lot of work. As a result, we will assume
  // that the relative order of the instructions in each unrolled iteration
  // is the same (although we will not make an assumption about how the
  // different iterations are intermixed). Note that while the order must be
  // the same, the instructions may not be in the same basic block.
  SmallInstructionSet Exclude(AllRoots);
  Exclude.insert(LoopIncs.begin(), LoopIncs.end());

  DenseSet<Instruction *> BaseUseSet;
  collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);

  DenseSet<Instruction *> AllRootUses;
  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);

  bool MatchFailed = false;
  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
    collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
                         PossibleRedSet, RootUseSet);

    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
                    " vs. iteration increment " << (i+1) <<
                    " use set size: " << RootUseSet.size() << "\n");

    if (BaseUseSet.size() != RootUseSet.size()) {
      MatchFailed = true;
      break;
    }

    // In addition to regular aliasing information, we need to look for
    // instructions from later (future) iterations that have side effects
    // preventing us from reordering them past other instructions with side
    // effects.
    bool FutureSideEffects = false;
    AliasSetTracker AST(*AA);

    // The map between instructions in f(%iv.(i+1)) and f(%iv).
    DenseMap<Value *, Value *> BaseMap;

    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
      if (cast<Instruction>(J1) == RealIV)
        continue;
      if (cast<Instruction>(J1) == IV)
        continue;
      if (!BaseUseSet.count(J1))
        continue;
      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
        continue;

      while (J2 != JE && (!RootUseSet.count(J2) ||
             std::find(Roots[i].begin(), Roots[i].end(), J2) !=
               Roots[i].end())) {
        // As we iterate through the instructions, instructions that don't
        // belong to previous iterations (or the base case) must belong to
        // future iterations. We want to track the alias set of writes from
        // previous iterations.
        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
            !AllRootUses.count(J2)) {
          if (J2->mayWriteToMemory())
            AST.add(J2);

          // Note: This is specifically guarded by a check on isa<PHINode>,
          // which, while a valid (somewhat arbitrary) micro-optimization, is
          // needed because otherwise isSafeToSpeculativelyExecute returns
          // false on PHI nodes.
          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
            FutureSideEffects = true;
        }

        ++J2;
      }

      if (!J1->isSameOperationAs(J2)) {
        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                        " vs. " << *J2 << "\n");
        MatchFailed = true;
        break;
      }

      // Make sure that this instruction, which is in the use set of this
      // root instruction, does not also belong to the base set or the set of
      // some previous root instruction.
      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                        " vs. " << *J2 << " (prev. case overlap)\n");
        MatchFailed = true;
        break;
      }

      // Make sure that we don't alias with any instruction in the alias set
      // tracker. If we do, then we depend on a future iteration, and we
      // can't reroll.
      if (J2->mayReadFromMemory()) {
        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
             K != KE && !MatchFailed; ++K) {
          if (K->aliasesUnknownInst(J2, *AA)) {
            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                            " vs. " << *J2 << " (depends on future store)\n");
            MatchFailed = true;
            break;
          }
        }
      }

      // If we've passed an instruction from a future iteration that may have
      // side effects, and this instruction might also, then we can't reorder
      // them, and this matching fails. As an exception, we allow the alias
      // set tracker to handle regular (simple) load/store dependencies.
      if (FutureSideEffects &&
            ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
             (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                        " vs. " << *J2 <<
                        " (side effects prevent reordering)\n");
        MatchFailed = true;
        break;
      }

      // For instructions that are part of a reduction, if the operation is
      // associative, then don't bother matching the operands (because we
      // already know that the instructions are isomorphic, and the order
      // within the iteration does not matter). For non-associative reductions,
      // we do need to match the operands, because we need to reject
      // out-of-order instructions within an iteration!
      // For example (assume floating-point addition), we need to reject this:
      //   x += a[i]; x += b[i];
      //   x += a[i+1]; x += b[i+1];
      //   x += b[i+2]; x += a[i+2];
      bool InReduction = Reductions.isPairInSame(J1, J2);

      if (!(InReduction && J1->isAssociative())) {
        bool Swapped = false, SomeOpMatched = false;
        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
          Value *Op2 = J2->getOperand(j);

          // If this is part of a reduction (and the operation is not
          // associative), then we match all operands, but not those that are
          // part of the reduction.
          if (InReduction)
            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
              if (Reductions.isPairInSame(J2, Op2I))
                continue;

          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
          if (BMI != BaseMap.end())
            Op2 = BMI->second;
          else if (std::find(Roots[i].begin(), Roots[i].end(),
                             (Instruction*) Op2) != Roots[i].end())
            Op2 = IV;

          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
            // If we've not already decided to swap the matched operands, and
            // we've not already matched our first operand (note that we could
            // have skipped matching the first operand because it is part of a
            // reduction above), and the instruction is commutative, then try
            // the swapped match.
            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
                J1->getOperand(!j) == Op2) {
              Swapped = true;
            } else {
              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                              " vs. " << *J2 << " (operand " << j << ")\n");
              MatchFailed = true;
              break;
            }
          }

          SomeOpMatched = true;
        }
      }

      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
                        " vs. " << *J2 << " (uses outside loop)\n");
        MatchFailed = true;
        break;
      }

      if (!MatchFailed)
        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));

      AllRootUses.insert(J2);
      Reductions.recordPair(J1, J2, i+1);

      ++J2;
    }
  }

  if (MatchFailed)
    return false;

  DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
                  *RealIV << "\n");

  DenseSet<Instruction *> LoopIncUseSet;
  collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
                       SmallInstructionSet(), LoopIncUseSet);
  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
                  LoopIncUseSet.size() << "\n");

  // Make sure that all instructions in the loop have been included in some
  // use set.
  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
       J != JE; ++J) {
    if (isa<DbgInfoIntrinsic>(J))
      continue;
    if (cast<Instruction>(J) == RealIV)
      continue;
    if (cast<Instruction>(J) == IV)
      continue;
    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
        (LoopIncUseSet.count(J) && (J->isTerminator() ||
                                    isSafeToSpeculativelyExecute(J, DL))))
      continue;

    if (AllRoots.count(J))
      continue;

    if (Reductions.isSelectedPHI(J))
      continue;

    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
                    " unprocessed instruction found: " << *J << "\n");
    MatchFailed = true;
    break;
  }

  if (MatchFailed)
    return false;

  DEBUG(dbgs() << "LRR: all instructions processed from " <<
                  *RealIV << "\n");

  if (!Reductions.validateSelected())
    return false;

  // At this point, we've validated the rerolling, and we're committed to
  // making changes!

  Reductions.replaceSelected();

  // Remove instructions associated with non-base iterations.
  for (BasicBlock::reverse_iterator J = Header->rbegin();
       J != Header->rend();) {
    if (AllRootUses.count(&*J)) {
      Instruction *D = &*J;
      DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
      D->eraseFromParent();
      continue;
    }

    ++J;
  }

  // Insert the new induction variable.
  const SCEV *Start = RealIVSCEV->getStart();
  if (Inc == 1)
    Start = SE->getMulExpr(Start,
                           SE->getConstant(Start->getType(), Scale));
  const SCEVAddRecExpr *H =
    cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
                           SE->getConstant(RealIVSCEV->getType(), 1),
                           L, SCEV::FlagAnyWrap));
  { // Limit the lifetime of SCEVExpander.
    SCEVExpander Expander(*SE, "reroll");
    Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin());

    for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
         JE = BaseUseSet.end(); J != JE; ++J)
      (*J)->replaceUsesOfWith(IV, NewIV);

    if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
      if (LoopIncUseSet.count(BI)) {
        const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
        if (Inc == 1)
          ICSCEV =
            SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));

        // Iteration count SCEV minus 1.
        const SCEV *ICMinus1SCEV =
          SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1));

        Value *ICMinus1; // Iteration count minus 1.
        if (isa<SCEVConstant>(ICMinus1SCEV)) {
          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), BI);
        } else {
          BasicBlock *Preheader = L->getLoopPreheader();
          if (!Preheader)
            Preheader = InsertPreheaderForLoop(L, this);

          ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(),
                                            Preheader->getTerminator());
        }

        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, ICMinus1,
                                   "exitcond");
        BI->setCondition(Cond);

        if (BI->getSuccessor(1) != Header)
          BI->swapSuccessors();
      }
    }
  }

  SimplifyInstructionsInBlock(Header, DL, TLI);
  DeleteDeadPHIs(Header, TLI);
  ++NumRerolledLoops;
  return true;
}
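// A minimal sketch, not part of the pass, of the kind of manually unrolled
// loop reroll() above is designed to collapse. The function and array names
// are illustrative only; the scale factor here is 3.
#if 0 // Example only; not compiled into the pass.
void rerollable_example(int *x, const int *a, int n /* multiple of 3 */) {
  for (int i = 0; i < n; i += 3) {
    x[i]     = 3 * a[i];     // f(%iv)
    x[i + 1] = 3 * a[i + 1]; // f(%iv.1); %iv.1 = add %iv, 1 is a root increment
    x[i + 2] = 3 * a[i + 2]; // f(%iv.2); %iv.2 = add %iv, 2 is a root increment
  }
  // After a successful reroll, the body is reduced to a single copy with a
  // stride-1 induction variable:
  //   for (int i = 0; i < n; ++i)
  //     x[i] = 3 * a[i];
}
#endif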
/// Recursively traverse the CFG of the function, renaming loads and
/// stores to the allocas which we are promoting.
///
/// IncomingVals indicates what value each Alloca contains on exit from the
/// predecessor block Pred.
void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
                                RenamePassData::ValVector &IncomingVals,
                                RenamePassData::LocationVector &IncomingLocs,
                                std::vector<RenamePassData> &Worklist) {
NextIteration:
  // If we are inserting any phi nodes into this BB, they will already be in
  // the block.
  if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
    // If we have PHI nodes to update, compute the number of edges from Pred to
    // BB.
    if (PhiToAllocaMap.count(APN)) {
      // We want to be able to distinguish between PHI nodes being inserted by
      // this invocation of mem2reg from those phi nodes that already existed
      // in the IR before mem2reg was run. We determine that APN is being
      // inserted because it is missing incoming edges. All other PHI nodes
      // being inserted by this pass of mem2reg will have the same number of
      // incoming operands so far. Remember this count.
      unsigned NewPHINumOperands = APN->getNumOperands();

      unsigned NumEdges = std::count(succ_begin(Pred), succ_end(Pred), BB);
      assert(NumEdges && "Must be at least one edge from Pred to BB!");

      // Add entries for all the phis.
      BasicBlock::iterator PNI = BB->begin();
      do {
        unsigned AllocaNo = PhiToAllocaMap[APN];

        // Update the location of the phi node.
        updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
                                       APN->getNumIncomingValues() > 0);

        // Add N incoming values to the PHI node.
        for (unsigned i = 0; i != NumEdges; ++i)
          APN->addIncoming(IncomingVals[AllocaNo], Pred);

        // The currently active variable for this block is now the PHI.
        IncomingVals[AllocaNo] = APN;
        for (DbgVariableIntrinsic *DII : AllocaDbgDeclares[AllocaNo])
          ConvertDebugDeclareToDebugValue(DII, APN, DIB);

        // Get the next phi node.
        ++PNI;
        APN = dyn_cast<PHINode>(PNI);
        if (!APN)
          break;

        // Verify that it is missing entries. If not, it is not being inserted
        // by this mem2reg invocation so we want to ignore it.
      } while (APN->getNumOperands() == NewPHINumOperands);
    }
  }

  // Don't revisit blocks.
  if (!Visited.insert(BB).second)
    return;

  for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
    Instruction *I = &*II++; // get the instruction, increment iterator

    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
      AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
      if (!Src)
        continue;

      DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
      if (AI == AllocaLookup.end())
        continue;

      Value *V = IncomingVals[AI->second];

      // If the load was marked as nonnull we don't want to lose
      // that information when we erase this Load. So we preserve
      // it with an assume.
      if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
          !isKnownNonZero(V, SQ.DL, 0, AC, LI, &DT))
        addAssumeNonNull(AC, LI);

      // Anything using the load now uses the current value.
      LI->replaceAllUsesWith(V);
      BB->getInstList().erase(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
      // Delete this instruction and mark the name as the current holder of
      // the value.
      AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
      if (!Dest)
        continue;

      DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
      if (ai == AllocaLookup.end())
        continue;

      // What value were we writing?
      unsigned AllocaNo = ai->second;
      IncomingVals[AllocaNo] = SI->getOperand(0);

      // Record debuginfo for the store before removing it.
      IncomingLocs[AllocaNo] = SI->getDebugLoc();
      for (DbgVariableIntrinsic *DII : AllocaDbgDeclares[ai->second])
        ConvertDebugDeclareToDebugValue(DII, SI, DIB);
      BB->getInstList().erase(SI);
    }
  }

  // 'Recurse' to our successors.
  succ_iterator I = succ_begin(BB), E = succ_end(BB);
  if (I == E)
    return;

  // Keep track of the successors so we don't visit the same successor twice.
  SmallPtrSet<BasicBlock *, 8> VisitedSuccs;

  // Handle the first successor without using the worklist.
  VisitedSuccs.insert(*I);
  Pred = BB;
  BB = *I;
  ++I;

  for (; I != E; ++I)
    if (VisitedSuccs.insert(*I).second)
      Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);

  goto NextIteration;
}
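// A minimal before/after sketch (illustrative names, not from this file) of
// the renaming that RenamePass performs once PHI nodes have been placed for a
// promotable alloca:
//
//   Before:                               After:
//     entry:                                entry:
//       %x = alloca i32                       br i1 %c, label %t, label %f
//       br i1 %c, label %t, label %f        t:
//     t:                                      br label %m
//       store i32 1, i32* %x                f:
//       br label %m                           br label %m
//     f:                                    m:
//       store i32 2, i32* %x                  %x.0 = phi i32 [ 1, %t ], [ 2, %f ]
//       br label %m                           call void @use(i32 %x.0)
//     m:
//       %v = load i32, i32* %x
//       call void @use(i32 %v)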
bool ModuloSchedulerDriverPass::runOnLoop(Loop *IncomingLoop,
                                          LPPassManager &LPM_Ref) {
  subscripts subs(IncomingLoop);

  if (!loop_is_ms_able(IncomingLoop))
    return false;

  // The header part of the parallelized loop will be placed here.
  BasicBlock *preheader = IncomingLoop->getLoopPreheader();
  assert(preheader && "Unable to get a hold of the preheader");

  // Balance all BasicBlocks in this loop.
  for (Loop::block_iterator it = IncomingLoop->block_begin();
       it != IncomingLoop->block_end(); ++it) {
    duplicateValuesWithMultipleUses(*it, subs.getInductionVar());
  }

  // For each BB in the loop.
  for (Loop::block_iterator it = IncomingLoop->block_begin();
       it != IncomingLoop->block_end(); ++it) {
    instructionPriority ip(*it);
    (*it)->setName("PipelinedLoop");

    // ++++++++ Preheader part +++++++++
    // Make a copy of the body for each instruction. Place a pointer to the
    // parallel cloned instruction in the map below. Later on we will replace
    // it with a PHINode.
    DenseMap<const Value *, Value *> InstToPreheader;

    // For each instruction in the body of the loop: clone, store, etc.
    for (BasicBlock::iterator ib = (*it)->begin(), eb = (*it)->end();
         ib != eb; ++ib) {
      // If this is NOT a phi node.
      if (!isa<PHINode>(ib)) {
        // Get the priority of the instruction.
        unsigned int p = ip.getPriority(ib);
        // This is the header version of each variable that goes into a PHI
        // node. The other edge needs to come from the 'prev' iteration.
        // We subtract 1 because this is one iteration before.
        // Store the result in the map of cloned instructions.
        InstToPreheader[ib] =
            copyLoopBodyToHeader(ib, subs.getInductionVar(), preheader, p - 1);
      }
    }

    // ++++++++ Loop body part +++++++++
    // For each of the cloned instructions, increment the indices if needed
    // and place the PHINode.
    for (BasicBlock::iterator ib = (*it)->begin(), eb = (*it)->end();
         ib != eb; ++ib) {
      // If this is NOT a phi node.
      if (!isa<PHINode>(ib)) {
        unsigned int p = ip.getPriority(ib);
        // If this variable is not dependent on i (not i:=i+1),
        // then we need to replace each i with i+5 ...
        // We also do not need to create a PHI node, etc.
        if (!subs.isUsedByInductionVariable(ib)) {
          incrementInductionVarIfUsed(ib, subs.getInductionVar(), p);

          // Create the new PHI node to replace the node.
          if (!isa<StoreInst>(ib) && !ib->isTerminator()) {
            std::string newname = "glue" + (*it)->getName();
            PHINode *np = PHINode::Create(ib->getType(), newname, *it);
            ib->replaceAllUsesWith(np);
            np->reserveOperandSpace(2);
            np->addIncoming(InstToPreheader[ib], preheader);
            np->addIncoming(ib, *it);
            np->moveBefore((*it)->begin());
          }
        } // end of if this is not an IV node (i:=i+1)
      }
    }
  }

  eliminateDuplicatedLoads(preheader);

  for (Loop::block_iterator it = IncomingLoop->block_begin();
       it != IncomingLoop->block_end(); ++it) {
    eliminateDuplicatedLoads(*it);
    for (BasicBlock::iterator in = (*it)->begin(); in != (*it)->end(); ++in) {
      foldAddInstructions(in);
    }
  }

  return true;
}
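// A minimal sketch (hypothetical value names and types) of the glue-PHI
// pattern built above: each non-PHI value %v gets a preheader clone computed
// for the previous iteration, and a PHI that selects between that clone and
// the in-loop value:
//
//   preheader:
//     %v.pre = ...                     ; copyLoopBodyToHeader(%v, ..., p-1)
//     br label %PipelinedLoop
//   PipelinedLoop:
//     %glue = phi i32 [ %v.pre, %preheader ], [ %v, %PipelinedLoop ]
//     ...                              ; former users of %v now use %glue
//     %v = ...                         ; original computation, induction
//                                      ; uses advanced by the priority p
//     br i1 %exitcond, label %exit, label %PipelinedLoop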