/// Partition the functions of \p M into equivalence classes and determine the
/// additional arguments for every class.
///
/// The classes are seeded from the global equivalence classes reported by DSA
/// (keeping only the Function members), then every defined function in the
/// module is inserted so that functions missing from the global classes still
/// get a class of their own.
void CSDataRando::findArgNodes(Module &M) {
  // Project DSA's global equivalence classes onto functions only.
  EquivalenceClasses<const GlobalValue*> &GlobalECs = DSA->getGlobalECs();
  EquivalenceClasses<const Function*> FunctionECs;

  for (auto GI = GlobalECs.begin(), GE = GlobalECs.end(); GI != GE; ++GI) {
    // Members are only reachable through the leader of a class.
    if (!GI->isLeader())
      continue;

    const Function *ClassLeader = nullptr;
    for (auto MI = GlobalECs.member_begin(GI), ME = GlobalECs.member_end();
         MI != ME; ++MI) {
      // Skip every member that is not a function.
      const Function *Fn = dyn_cast<Function>(*MI);
      if (!Fn)
        continue;

      // The first function seen seeds the class; later ones are merged in.
      if (!ClassLeader)
        ClassLeader = FunctionECs.getOrInsertLeaderValue(Fn);
      else
        FunctionECs.unionSets(ClassLeader, Fn);
    }
  }

  // Non-address-taken functions may not appear in GlobalECs, so give every
  // defined function a class explicitly.
  for (Function &F : M) {
    if (F.isDeclaration())
      continue;
    FunctionECs.insert(&F);
  }

  // Visit each class once, through its leader, and find the extra arguments.
  for (auto CI = FunctionECs.begin(), CE = FunctionECs.end(); CI != CE; ++CI) {
    if (!CI->isLeader())
      continue;

    NumFunctionECs++;
    std::vector<const Function*> Members(FunctionECs.member_begin(CI),
                                         FunctionECs.member_end());

    // Address is replaceable: process the whole class together.
    if (DSA->canReplaceAddress(CI->getData())) {
      findFunctionArgNodes(Members);
      continue;
    }

    // If uses of the function's address cannot safely be replaced with its
    // clone's address, indirect calls to this class cannot be transformed.
    // The arg nodes are still found per function so that direct calls can be
    // replaced.
    NumFunECsWithExternal++;
    for (const Function *Fn : Members) {
      if (Fn->isDeclaration())
        continue;
      findFunctionArgNodes(Fn);
      FunctionInfo[Fn].CanReplaceAddress = false;
    }
  }
}
/// Compute, for the instructions in \p Blocks, the minimum bitwidth each one
/// could be narrowed to.
///
/// DemandedBits supplies the live-out bits of every value. To avoid having to
/// insert extra casts, every DAG of connected values must share one minimum
/// bitwidth, so connected values are grouped into equivalence classes and the
/// demanded bits of a class are the union of its members' demanded bits.
///
/// \param Blocks the basic blocks whose instructions are considered.
/// \param DB     source of demanded-bits information.
/// \param TTI    optional; when non-null it is used to skip work on types that
///               are already legal.
/// \returns a map from instruction to its minimum bitwidth, containing only
///          instructions that can be narrowed; empty if nothing can be done or
///          if a value wider than 64 bits is encountered.
MapVector<Instruction *, uint64_t>
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                               const TargetTransformInfo *TTI) {
  EquivalenceClasses<Value *> ECs;
  SmallVector<Value *, 16> Worklist;
  SmallPtrSet<Value *, 4> Roots;
  SmallPtrSet<Value *, 16> Visited;
  DenseMap<Value *, uint64_t> DBits;
  SmallPtrSet<Instruction *, 4> InstructionSet;
  MapVector<Instruction *, uint64_t> MinBWs;

  // Collect the roots: the walk goes bottom-up starting at truncs and icmps.
  bool SeenExtFromIllegalType = false;
  for (auto *BB : Blocks) {
    for (auto &I : *BB) {
      InstructionSet.insert(&I);

      const bool IsExt = isa<ZExtInst>(&I) || isa<SExtInst>(&I);
      if (TTI && IsExt && !TTI->isTypeLegal(I.getOperand(0)->getType()))
        SeenExtFromIllegalType = true;

      // Only non-vector integers of at most 64 bits are handled.
      const bool IsTrunc = isa<TruncInst>(&I);
      if (!IsTrunc && !isa<ICmpInst>(&I))
        continue;
      if (I.getType()->isVectorTy())
        continue;
      if (I.getOperand(0)->getType()->getScalarSizeInBits() > 64)
        continue;

      // A trunc whose result type is already legal is not worth queueing.
      if (TTI && IsTrunc && TTI->isTypeLegal(I.getType()))
        continue;

      Worklist.push_back(&I);
      Roots.insert(&I);
    }
  }

  // Nothing to start from, or no extension from an illegal type was seen.
  if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
    return MinBWs;

  // Walk from the roots through operands, unioning connected values.
  while (!Worklist.empty()) {
    Value *Cur = Worklist.pop_back_val();
    // Must run before the visited check: it registers Cur in ECs.
    Value *ClassLeader = ECs.getOrInsertLeaderValue(Cur);

    if (!Visited.insert(Cur).second)
      continue;

    // Non-instructions terminate a chain successfully.
    auto *Inst = dyn_cast<Instruction>(Cur);
    if (!Inst)
      continue;

    // Demanded bits wider than 64 bits cannot be represented here; give up.
    const auto Demanded = DB.getDemandedBits(Inst);
    if (Demanded.getBitWidth() > 64)
      return MapVector<Instruction *, uint64_t>();

    const uint64_t Bits = Demanded.getZExtValue();
    DBits[ClassLeader] |= Bits;
    DBits[Inst] = Bits;

    // Extensions, loads and instructions outside the given blocks terminate
    // a chain successfully.
    if (isa<SExtInst>(Inst) || isa<ZExtInst>(Inst) || isa<LoadInst>(Inst) ||
        !InstructionSet.count(Inst))
      continue;

    // Unsafe casts terminate a chain unsuccessfully: nothing useful can be
    // done with bitcasts, ptrtoints or inttoptrs, and transforming anything
    // that relies on them would be unsafe. Mark the class fully demanded.
    const bool IsUnsafeCast = isa<BitCastInst>(Inst) ||
                              isa<PtrToIntInst>(Inst) ||
                              isa<IntToPtrInst>(Inst);
    if (IsUnsafeCast || !Inst->getType()->isIntegerTy()) {
      DBits[ClassLeader] |= ~0ULL;
      continue;
    }

    // PHI types are never modified: reductions will already have been
    // truncated if possible, and induction sizes were chosen by indvars.
    if (isa<PHINode>(Inst))
      continue;

    // Every bit is already demanded, so following operands gains nothing.
    if (DBits[ClassLeader] == ~0ULL)
      continue;

    for (Value *Op : Inst->operands()) {
      ECs.unionSets(ClassLeader, Op);
      Worklist.push_back(Op);
    }
  }

  // Any integer-typed user that was not discovered above makes the chain of
  // the value it uses unoptimizable: mark that class as fully demanded.
  for (auto &Entry : DBits)
    for (auto *U : Entry.first->users())
      if (U->getType()->isIntegerTy() && !DBits.count(U))
        DBits[ECs.getOrInsertLeaderValue(Entry.first)] |= ~0ULL;

  for (auto CI = ECs.begin(), CE = ECs.end(); CI != CE; ++CI) {
    // Union the demanded bits of every member of this class.
    uint64_t ClassBits = 0;
    for (auto MI = ECs.member_begin(CI), ME = ECs.member_end(); MI != ME; ++MI)
      ClassBits |= DBits[*MI];

    // Number of significant bits, rounded up to a power of 2.
    uint64_t MinBW =
        (sizeof(ClassBits) * 8) - llvm::countLeadingZeros(ClassBits);
    if (!isPowerOf2_64(MinBW))
      MinBW = NextPowerOf2(MinBW);

    // PHIs are never retyped (see above). If this width would require
    // shrinking a PHI, abandon the entire equivalence class.
    bool WouldShrinkPHI = false;
    for (auto MI = ECs.member_begin(CI), ME = ECs.member_end(); MI != ME;
         ++MI) {
      if (isa<PHINode>(*MI) &&
          MinBW < (*MI)->getType()->getScalarSizeInBits()) {
        WouldShrinkPHI = true;
        break;
      }
    }
    if (WouldShrinkPHI)
      continue;

    for (auto MI = ECs.member_begin(CI), ME = ECs.member_end(); MI != ME;
         ++MI) {
      auto *Member = dyn_cast<Instruction>(*MI);
      if (!Member)
        continue;

      // Roots are compared against the type of their first operand rather
      // than their own result type.
      Type *Ty = Roots.count(*MI) ? Member->getOperand(0)->getType()
                                  : (*MI)->getType();
      if (MinBW < Ty->getScalarSizeInBits())
        MinBWs[Member] = MinBW;
    }
  }

  return MinBWs;
}
/// Scan \p MBB for chains, group chains with overlapping ranges into disjoint
/// sets, and color each set.
///
/// \returns true if coloring any chain set reported a change.
bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");

  // Step 1: scan the block, producing the set of chains.
  //
  // ActiveChains holds the chains that can still be extended and have not
  // been killed yet. It is keyed by register: a chain has exactly one "link"
  // register between consecutive instructions.
  std::map<unsigned, Chain*> ActiveChains;
  std::set<Chain*> AllChains;
  unsigned Idx = 0;
  for (auto &MI : MBB)
    scanInstruction(&MI, Idx++, ActiveChains, AllChains);
  DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");

  // Step 2: partition the chains into disjoint sets by liveness range. This
  // is a poor-man's graph coloring. A real interference graph with full
  // coloring would be ideal, but:
  //  (a) it is rather heavyweight for only two colors, and
  //  (b) multiple disjoint interference regions are expected - in practice
  //      chain live ranges are quite small and cluster between loads and
  //      stores.
  EquivalenceClasses<Chain*> EC;
  for (auto *C : AllChains)
    EC.insert(C);
  for (auto *Lhs : AllChains)
    for (auto *Rhs : AllChains)
      if (Lhs != Rhs && Lhs->rangeOverlapsWith(Rhs))
        EC.unionSets(Lhs, Rhs);
  DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");

  // Step 3: from here on, every member of a class is assumed to interfere
  // with every other member of that class and with no member of any other
  // class. Flatten the classes into a plain vector of vectors, dropping the
  // entries that yield no members.
  std::vector<std::vector<Chain*> > V;
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    V.emplace_back(EC.member_begin(I), EC.member_end());
    if (V.back().empty())
      V.pop_back();
  }

  // Step 4: order the sets by the start of their first chain so they can be
  // processed sequentially.
  auto StartsEarlier = [](const std::vector<Chain*> &A,
                          const std::vector<Chain*> &B) {
    return A.front()->startsBefore(B.front());
  };
  std::sort(V.begin(), V.end(), StartsEarlier);

  // With only two colors, the block-level balance of odds versus evens can be
  // tracked in one counter; the aim is to keep it near zero so both execution
  // units stay fed. Positive means even-heavy, negative means odd-heavy.
  //
  // FIXME: If chains have interdependencies, for example:
  //   mul r0, r1, r2
  //   mul r3, r0, r1
  // We do not model this and may color each one differently, assuming we'll
  // get ILP when we obviously can't. This hasn't been seen to be a problem
  // in practice so far, so we simplify the algorithm by ignoring it.
  int Parity = 0;

  bool Changed = false;
  for (auto &ChainSet : V)
    Changed |= colorChainSet(ChainSet, MBB, Parity);

  // Every chain collected in AllChains is released here.
  for (auto *C : AllChains)
    delete C;

  return Changed;
}