void Vxls::splitCritEdges() {
  smart::vector<unsigned> preds;
  preds.resize(unit.blocks.size());
  for (auto pred : blocks) {
    auto succlist = succs(unit.blocks[pred]);
    for (auto succ : succlist) {
      preds[succ]++;
    }
  }
  auto resort = false;
  for (auto pred : blocks) {
    auto succlist = succs(unit.blocks[pred]);
    if (succlist.size() <= 1) continue;
    for (auto& succ : succlist) {
      if (preds[succ] <= 1) continue;
      // split the critical edge.
      auto middle = unit.makeBlock(unit.blocks[succ].area);
      unit.blocks[middle].code.emplace_back(jmp{succ});
      succ = middle;
      resort = true;
    }
  }
  if (resort) {
    blocks = sortBlocks(unit);
  }
}
void lowerForARM(Vunit& unit) {
  assertx(check(unit));

  // block order doesn't matter, but only visit reachable blocks.
  auto blocks = sortBlocks(unit);

  for (auto b : blocks) {
    auto oldCode = std::move(unit.blocks[b].code);
    Vout v{unit, b};

    for (auto& inst : oldCode) {
      v.setOrigin(inst.origin);

      switch (inst.op) {
#define O(nm, imm, use, def) \
        case Vinstr::nm: \
          lower(inst.nm##_, v); \
          break;

        VASM_OPCODES
#undef O
      }
    }
  }

  assertx(check(unit));
  printUnit(kVasmARMFoldLevel, "after lowerForARM", unit);
}
// Remove dead instructions by doing a traditional liveness analysis.
// Instructions that mutate memory, physical registers, or status flags
// are considered useful. All branches are considered useful.
//
// Given SSA, there's a faster sparse version of this algorithm that marks
// useful instructions in one pass, then transitively marks pure instructions
// that define inputs to useful instructions. However it requires a mapping
// from vreg numbers to the instruction that defines them, and a way to address
// individual instructions.
//
// We could remove useless branches by computing the post-dominator tree and
// RDF(b) for each block; then a branch is only useful if it controls whether
// or not a useful block executes, and useless branches can be forwarded to
// the nearest useful post-dominator.
void removeDeadCode(Vunit& unit) {
  auto blocks = sortBlocks(unit);
  jit::vector<LiveSet> livein(unit.blocks.size());
  LiveSet live(unit.next_vr);
  auto pass = [&](bool mutate) {
    bool changed = false;
    for (auto blockIt = blocks.end(); blockIt != blocks.begin();) {
      auto b = *--blockIt;
      auto& block = unit.blocks[b];
      live.reset();
      for (auto s : succs(block)) {
        if (!livein[s].empty()) {
          live |= livein[s];
        }
      }
      for (auto i = block.code.end(); i != block.code.begin();) {
        auto& inst = *--i;
        auto useful = effectful(inst);
        visitDefs(unit, inst, [&](Vreg r) {
          if (r.isPhys() || live.test(r)) {
            useful = true;
            live.reset(r);
          }
        });
        if (useful) {
          visitUses(unit, inst, [&](Vreg r) {
            live.set(r);
          });
        } else if (mutate) {
          inst = nop{};
          changed = true;
        }
      }
      if (mutate) {
        assert(live == livein[b]);
      } else {
        if (live != livein[b]) {
          livein[b] = live;
          changed = true;
        }
      }
    }
    return changed;
  };
  // analyze until livein reaches a fixed point
  while (pass(false)) {}
  // nop-out useless instructions
  if (pass(true)) {
    for (auto b : blocks) {
      auto& code = unit.blocks[b].code;
      auto end = std::remove_if(code.begin(), code.end(), [&](Vinstr& inst) {
        return inst.op == Vinstr::nop;
      });
      code.erase(end, code.end());
    }
    printUnit(kVasmDCELevel, "after vasm-dead", unit);
  }
}
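For reference, here is a minimal standalone sketch (not HHVM code) of the same backward-liveness idea on a single straight-line block; the real pass above additionally walks blocks in reverse order and re-runs until the per-block livein sets reach a fixed point. ToyInstr and removeDead are hypothetical names introduced only for this illustration.

#include <vector>

struct ToyInstr {            // hypothetical toy instruction
  bool effectful;            // mutates memory, physical regs, or flags
  int def = -1;              // virtual register defined, -1 if none
  std::vector<int> uses;     // virtual registers read
};

// Walk backwards; keep an instruction if it is effectful or its result is
// live, and mark its inputs live. Everything else is dead and dropped.
std::vector<ToyInstr> removeDead(const std::vector<ToyInstr>& code, int nregs) {
  std::vector<bool> live(nregs, false);
  std::vector<ToyInstr> kept;
  for (auto it = code.rbegin(); it != code.rend(); ++it) {
    bool useful = it->effectful || (it->def >= 0 && live[it->def]);
    if (!useful) continue;                    // dead: drop it
    if (it->def >= 0) live[it->def] = false;  // the def kills liveness
    for (int r : it->uses) live[r] = true;    // the uses become live
    kept.push_back(*it);
  }
  return {kept.rbegin(), kept.rend()};        // restore original order
}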
Clusterizer(Vunit& unit, const Scale& scale)
    : m_unit(unit)
    , m_scale(scale)
    , m_blocks(sortBlocks(unit)) {
  initClusters();
  clusterize();
  sortClusters();
  splitHotColdClusters();
  FTRACE(1, "{}", toString());
}
static void sortBlocks(SWHirschbergBlock list[], int start, int end) {

    SWHirschbergBlock key;

    int frontIdx;
    int backIdx;
    int pivot;

    if (start < end) {

        // move the middle element to the front and use it as the pivot
        pivot = (start + end) / 2;
        swapBlocks(&list[start], &list[pivot]);
        key = list[start];

        frontIdx = start + 1;
        backIdx = end;

        // partition: elements <= key to the left, > key to the right
        while (frontIdx <= backIdx) {
            while (frontIdx <= end && compareBlocks(&list[frontIdx], &key) <= 0) {
                frontIdx++;
            }
            while (backIdx >= start && compareBlocks(&list[backIdx], &key) > 0) {
                backIdx--;
            }
            if (frontIdx < backIdx) {
                swapBlocks(&list[frontIdx], &list[backIdx]);
            }
        }

        // place the pivot in its final position and recurse on both halves
        swapBlocks(&list[start], &list[backIdx]);
        sortBlocks(list, start, backIdx - 1);
        sortBlocks(list, backIdx + 1, end);
    }
}
// Remove dead instructions by doing a traditional liveness analysis.
// Instructions that mutate memory, physical registers, or status flags
// are considered useful. All branches are considered useful.
//
// Given SSA, there's a faster sparse version of this algorithm that marks
// useful instructions in one pass, then transitively marks pure instructions
// that define inputs to useful instructions. However it requires a mapping
// from vreg numbers to the instruction that defines them, and a way to address
// individual instructions.
//
// We could remove useless branches by computing the post-dominator tree and
// RDF(b) for each block; then a branch is only useful if it controls whether
// or not a useful block executes, and useless branches can be forwarded to
// the nearest useful post-dominator.
void removeDeadCode(Vunit& unit) {
  Timer timer(Timer::vasm_dce);
  auto blocks = sortBlocks(unit);
  jit::vector<LiveSet> livein(unit.blocks.size());
  LiveSet live(unit.next_vr);
  auto pass = [&](bool mutate) {
    bool changed = false;
    for (auto blockIt = blocks.end(); blockIt != blocks.begin();) {
      auto b = *--blockIt;
      auto& block = unit.blocks[b];
      live.reset();
      for (auto s : succs(block)) {
        if (!livein[s].empty()) {
          live |= livein[s];
        }
      }
      for (auto i = block.code.end(); i != block.code.begin();) {
        auto& inst = *--i;
        auto useful = effectful(inst);
        visitDefs(unit, inst, [&](Vreg r) {
          if (r.isPhys() || live.test(r)) {
            useful = true;
            live.reset(r);
          }
        });
        if (useful) {
          visitUses(unit, inst, [&](Vreg r) {
            live.set(r);
          });
        } else if (mutate) {
          inst = nop{};
          changed = true;
        }
      }
      if (mutate) {
        assertx(live == livein[b]);
      } else {
        if (live != livein[b]) {
          livein[b] = live;
          changed = true;
        }
      }
    }
    return changed;
  };
  // analyze until livein reaches a fixed point
  while (pass(false)) {}
  auto const changed = pass(true);
  removeTrivialNops(unit);
  if (changed) {
    printUnit(kVasmDCELevel, "after vasm-dead", unit);
  }
}
void logTranslation(const TransEnv& env, const TransRange& range) {
  auto nanos = HPHP::Timer::GetThreadCPUTimeNanos() - env.unit->startNanos();
  auto& cols = *env.unit->logEntry();
  auto& context = env.unit->context();
  auto kind = show(context.kind);
  cols.setStr("trans_kind", !debug ? kind : kind + "_debug");
  if (context.func) {
    cols.setStr("func", context.func->fullName()->data());
  }
  cols.setInt("jit_sample_rate", RuntimeOption::EvalJitSampleRate);
  // timing info
  cols.setInt("jit_micros", nanos / 1000);
  // hhir stats
  cols.setInt("max_tmps", env.unit->numTmps());
  cols.setInt("max_blocks", env.unit->numBlocks());
  cols.setInt("max_insts", env.unit->numInsts());
  auto hhir_blocks = rpoSortCfg(*env.unit);
  cols.setInt("num_blocks", hhir_blocks.size());
  size_t num_insts = 0;
  for (auto b : hhir_blocks) num_insts += b->instrs().size();
  cols.setInt("num_insts", num_insts);
  // vasm stats
  if (env.vunit) {
    cols.setInt("max_vreg", env.vunit->next_vr);
    cols.setInt("max_vblocks", env.vunit->blocks.size());
    cols.setInt("max_vcalls", env.vunit->vcallArgs.size());
    size_t max_vinstr = 0;
    for (auto& blk : env.vunit->blocks) max_vinstr += blk.code.size();
    cols.setInt("max_vinstr", max_vinstr);
    cols.setInt("num_vconst", env.vunit->constToReg.size());
    auto vblocks = sortBlocks(*env.vunit);
    size_t num_vinstr[kNumAreas] = {0, 0, 0};
    size_t num_vblocks[kNumAreas] = {0, 0, 0};
    for (auto b : vblocks) {
      const auto& block = env.vunit->blocks[b];
      num_vinstr[(int)block.area_idx] += block.code.size();
      num_vblocks[(int)block.area_idx]++;
    }
    cols.setInt("num_vinstr_main", num_vinstr[(int)AreaIndex::Main]);
    cols.setInt("num_vinstr_cold", num_vinstr[(int)AreaIndex::Cold]);
    cols.setInt("num_vinstr_frozen", num_vinstr[(int)AreaIndex::Frozen]);
    cols.setInt("num_vblocks_main", num_vblocks[(int)AreaIndex::Main]);
    cols.setInt("num_vblocks_cold", num_vblocks[(int)AreaIndex::Cold]);
    cols.setInt("num_vblocks_frozen", num_vblocks[(int)AreaIndex::Frozen]);
  }
  // x64 stats
  cols.setInt("main_size", range.main.size());
  cols.setInt("cold_size", range.cold.size());
  cols.setInt("frozen_size", range.frozen.size());
  // finish & log
  StructuredLog::log("hhvm_jit", cols);
}
void Vxls::allocate() {
  blocks = sortBlocks(unit);
  splitCritEdges();
  computePositions();
  analyzeRsp();
  buildIntervals();
  walkIntervals();
  resolveSplits();
  lowerCopyargs();
  resolveEdges();
  renameOperands();
  insertCopies();
}
jit::vector<Vlabel> layoutBlocks(const Vunit& unit) {
  auto blocks = sortBlocks(unit);
  // Partition into main/cold/frozen areas without changing relative order, and
  // the end{} block will be last.
  auto coldIt = std::stable_partition(blocks.begin(), blocks.end(),
    [&](Vlabel b) {
      return unit.blocks[b].area == AreaIndex::Main &&
             unit.blocks[b].code.back().op != Vinstr::fallthru;
    });
  std::stable_partition(coldIt, blocks.end(),
    [&](Vlabel b) {
      return unit.blocks[b].area == AreaIndex::Cold &&
             unit.blocks[b].code.back().op != Vinstr::fallthru;
    });
  return blocks;
}
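The two chained std::stable_partition calls are what keep the area grouping from perturbing the relative order produced by sortBlocks. A minimal sketch of that idiom, using a hypothetical Area enum and layout function rather than the HHVM types:

#include <algorithm>
#include <vector>

enum class Area { Main, Cold, Frozen };

// Group block indices as Main, then Cold, then Frozen. Each pass is stable,
// so the incoming relative order is preserved within every group.
std::vector<int> layout(const std::vector<Area>& areaOf) {
  std::vector<int> order(areaOf.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = int(i);
  auto coldIt = std::stable_partition(order.begin(), order.end(),
    [&](int b) { return areaOf[b] == Area::Main; });
  std::stable_partition(coldIt, order.end(),
    [&](int b) { return areaOf[b] == Area::Cold; });
  return order;
}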
/*
 * Branch fusion:
 * Analyze blocks one at a time, looking for the sequence:
 *
 *   setcc cc, f1 => b
 *   ...
 *   testb b, b => f2
 *   ...
 *   jcc E|NE, f2
 *
 * If found, and f2 is only used by the jcc, then change the code to:
 *
 *   setcc cc, f1 => b
 *   ...
 *   nop
 *   ...
 *   jcc !cc|cc, f1
 *
 * Later, vasm-dead will clean up the nop, and the setcc if b became dead.
 *
 * During the search, any other instruction that has a status flag result
 * will reset the pattern matcher. No instruction can "kill" flags,
 * since flags are SSA variables. However the transformation we want to
 * make extends the setcc flags lifetime, and we don't want it to overlap
 * another flag's lifetime.
 */
void fuseBranches(Vunit& unit) {
  auto blocks = sortBlocks(unit);
  jit::vector<unsigned> uses(unit.next_vr);
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      visitUses(unit, inst, [&](Vreg r) { uses[r]++; });
    }
  }
  bool should_print = false;
  for (auto b : blocks) {
    auto& code = unit.blocks[b].code;
    ConditionCode cc;
    Vreg setcc_flags, setcc_dest, testb_flags;
    unsigned testb_index;
    for (unsigned i = 0, n = code.size(); i < n; ++i) {
      if (code[i].op == Vinstr::setcc) {
        cc = code[i].setcc_.cc;
        setcc_flags = code[i].setcc_.sf;
        setcc_dest = code[i].setcc_.d;
        continue;
      }
      if (setcc_flags.isValid() && match_testb(code[i], setcc_dest) &&
          uses[code[i].testb_.sf] == 1) {
        testb_flags = code[i].testb_.sf;
        testb_index = i;
        continue;
      }
      if (match_jcc(code[i], testb_flags)) {
        code[testb_index] = nop{}; // erase the testb
        auto& jcc = code[i].jcc_;
        jcc.cc = jcc.cc == CC_NE ? cc : ccNegate(cc);
        jcc.sf = setcc_flags;
        should_print = true;
        continue;
      }
      if (setcc_flags.isValid() && sets_flags(code[i])) {
        setcc_flags = testb_flags = Vreg{};
      }
    }
  }
  if (should_print) {
    printUnit(kVasmFusionLevel, "after vasm-fusion", unit);
  }
}
void foldImms(Vunit& unit) {
  assertx(check(unit)); // especially, SSA
  // block order doesn't matter, but only visit reachable blocks.
  auto blocks = sortBlocks(unit);

  // Use flag for each register. If a status register (SR) is used then
  // certain optimizations will not fire, since they do not set the condition
  // codes as the original instruction(s) would.
  jit::vector<bool> used(unit.next_vr);
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      visitUses(unit, inst, [&](Vreg r) { used[r] = true; });
    }
  }

  Folder folder(std::move(used));
  folder.vals.resize(unit.next_vr);
  folder.valid.resize(unit.next_vr);
  // figure out which Vregs are constants and stash their values.
  for (auto& entry : unit.constToReg) {
    folder.valid.set(entry.second);
    folder.vals[entry.second] = entry.first.val;
  }
  // now mutate instructions
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      switch (inst.op) {
#define O(name, imms, uses, defs) \
        case Vinstr::name: { \
          auto origin = inst.origin; \
          folder.fold(inst.name##_, inst); \
          inst.origin = origin; \
          break; \
        }
        VASM_OPCODES
#undef O
      }
    }
  }
  printUnit(kVasmImmsLevel, "after foldImms", unit);
}
/**
 * Chain the retranslation blocks. This method enforces that, for
 * each region block, all its successors have distinct SrcKeys.
 */
void RegionDesc::chainRetransBlocks() {
  jit::vector<Chain> chains;
  BlockToChainMap block2chain;

  // 1. Initially assign each region block to its own chain.
  for (auto b : blocks()) {
    auto bid = b->id();
    auto cid = chains.size();
    chains.push_back({cid, {bid}});
    block2chain[bid] = cid;
  }

  // 2. For each block, if it has 2 successors with the same SrcKey,
  //    then merge the successors' chains into one.
  for (auto b : blocks()) {
    auto bid = b->id();
    const auto& succSet = succs(bid);
    for (auto it1 = succSet.begin(); it1 != succSet.end(); it1++) {
      auto bid1 = *it1;
      auto cid1 = block2chain[bid1];
      for (auto it2 = it1 + 1; it2 != succSet.end(); it2++) {
        auto bid2 = *it2;
        auto cid2 = block2chain[bid2];
        if (data(bid1).block->start() == data(bid2).block->start()) {
          mergeChains(chains[cid1], chains[cid2], block2chain);
        }
      }
    }
  }

  // 3. Sort each chain. In general, we want to sort each chain in
  //    decreasing order of profile weights. However, note that this
  //    transformation can turn acyclic graphs into cyclic ones (see
  //    example below). Therefore, if JitLoops are disabled, we
  //    instead sort each chain following the original block order,
  //    which prevents loops from being generated if the region was
  //    originally acyclic.
  //
  //    Here's an example showing how an acyclic CFG can become cyclic
  //    by chaining its retranslation blocks:
  //
  //    - Region before chaining retranslation blocks, where B2' and B2"
  //      are retranslations starting at the same SrcKey:
  //        B1  -> B2'
  //        B1  -> B2"
  //        B2' -> B3
  //        B3  -> B2"
  //
  //    - Region after sorting the chain as B2" -R-> B2':
  //        B1  -> B2"
  //        B2" -R-> B2'
  //        B2' -> B3
  //        B3  -> B2"
  //      Note the cycle: B2" -R-> B2' -> B3 -> B2".
  //
  auto profData = mcg->tx().profData();

  auto weight = [&](RegionDesc::BlockId bid) {
    return hasTransID(bid) ? profData->absTransCounter(getTransID(bid)) : 0;
  };

  auto sortGeneral = [&](RegionDesc::BlockId bid1, RegionDesc::BlockId bid2) {
    return weight(bid1) > weight(bid2);
  };

  using SortFun = std::function<bool(RegionDesc::BlockId, RegionDesc::BlockId)>;
  SortFun sortFunc = sortGeneral;

  hphp_hash_map<RegionDesc::BlockId, uint32_t> origBlockOrder;
  if (!RuntimeOption::EvalJitLoops) {
    for (uint32_t i = 0; i < m_blocks.size(); i++) {
      origBlockOrder[m_blocks[i]->id()] = i;
    }
    auto sortAcyclic = [&](RegionDesc::BlockId bid1, RegionDesc::BlockId bid2) {
      return origBlockOrder[bid1] < origBlockOrder[bid2];
    };
    sortFunc = sortAcyclic;
  }

  TRACE(1, "chainRetransBlocks: computed chains:\n");
  for (auto& c : chains) {
    std::sort(c.blocks.begin(), c.blocks.end(), sortFunc);

    if (Trace::moduleEnabled(Trace::region, 1) && c.blocks.size() > 0) {
      FTRACE(1, "  -> {} (w={})", c.blocks[0], weight(c.blocks[0]));
      for (size_t i = 1; i < c.blocks.size(); i++) {
        FTRACE(1, ", {} (w={})", c.blocks[i], weight(c.blocks[i]));
      }
      FTRACE(1, "\n");
    }
  }

  // 4. Set the nextRetrans blocks according to the computed chains.
  for (auto& c : chains) {
    if (c.blocks.size() == 0) continue;
    for (size_t i = 0; i < c.blocks.size() - 1; i++) {
      setNextRetrans(c.blocks[i], c.blocks[i + 1]);
    }
  }

  // 5. For each block with multiple successors in the same chain,
  //    only keep the successor that first appears in the chain.
  for (auto b : blocks()) {
    auto& succSet = data(b->id()).succs;
    for (auto s : succSet) {
      auto& c = chains[block2chain[s]];
      auto selectedSucc = findFirstInSet(c, succSet);
      for (auto other : c.blocks) {
        if (other == selectedSucc) continue;
        succSet.erase(other);
      }
    }
  }

  // 6. Reorder the blocks in the region in topological order (if
  //    region is acyclic), since the previous steps may break it.
  sortBlocks();
}
static void sort(SWHirschbergData* data) {
    if (!data->sorted) {
        // mark as sorted so the blocks are only sorted once
        data->sorted = 1;
        sortBlocks(data->blocks, 0, data->blockNmr - 1);
    }
}