void lowerForARM(Vunit& unit) {
  assertx(check(unit));

  // block order doesn't matter, but only visit reachable blocks.
  auto blocks = sortBlocks(unit);

  for (auto b : blocks) {
    auto oldCode = std::move(unit.blocks[b].code);
    Vout v{unit, b};

    for (auto& inst : oldCode) {
      v.setOrigin(inst.origin);

      switch (inst.op) {
#define O(nm, imm, use, def)    \
        case Vinstr::nm:        \
          lower(inst.nm##_, v); \
          break;

        VASM_OPCODES
#undef O
      }
    }
  }

  assertx(check(unit));
  printUnit(kVasmARMFoldLevel, "after lowerForARM", unit);
}
void SenMLRecord::fieldsToJson() {
  int bnLength = this->_name.length();
  if(bnLength){
    printText("\"n\":\"", 5);
    printText(this->_name.c_str(), bnLength);
    printText("\"", 1);
  }
  if(!isnan(this->_time)){
    printText(",\"t\":", 5);
    printDouble(this->_time, SENML_MAX_DOUBLE_PRECISION);
  }
  if(this->_unit != SENML_UNIT_NONE){
    printText(",\"u\":\"", 6);
    printUnit(this->_unit);
    printText("\"", 1);
  }
  if(this->_updateTime != 0){
    printText(",\"ut\":", 6);                  // ",\"ut\":" is 6 chars, not 5
#ifdef __MBED__
    char buf[10];
    sprintf(buf, "%d", this->_updateTime);
    String val = buf;
#else
    String val(this->_updateTime);
#endif
    printText(val.c_str(), val.length());
  }
}
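// A minimal standalone sketch (not part of the library) illustrating the JSON
// fragment fieldsToJson() above emits. The record name "temp", time value and
// unit "Cel" are made-up example inputs; snprintf stands in for the library's
// printText/printDouble/printUnit streaming helpers.
#include <cstdio>

int main() {
  char buf[128];
  // Equivalent fragment: "n":"<name>","t":<time>,"u":"<unit>"
  std::snprintf(buf, sizeof(buf), "\"n\":\"%s\",\"t\":%.10g,\"u\":\"%s\"",
                "temp", 1276020076.0, "Cel");
  std::puts(buf);  // prints: "n":"temp","t":1276020076,"u":"Cel"
  return 0;
}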
// Remove dead instructions by doing a traditional liveness analysis.
// instructions that mutate memory, physical registers, or status flags
// are considered useful. All branches are considered useful.
//
// Given SSA, there's a faster sparse version of this algorithm that marks
// useful instructions in one pass, then transitively marks pure instructions
// that define inputs to useful instructions. However it requires a mapping
// from vreg numbers to the instruction that defines them, and a way to address
// individual instructions.
//
// We could remove useless branches by computing the post-dominator tree and
// RDF(b) for each block; then a branch is only useful if it controls whether
// or not a useful block executes, and useless branches can be forwarded to
// the nearest useful post-dominator.
void removeDeadCode(Vunit& unit) {
  auto blocks = sortBlocks(unit);
  jit::vector<LiveSet> livein(unit.blocks.size());
  LiveSet live(unit.next_vr);

  auto pass = [&](bool mutate) {
    bool changed = false;
    for (auto blockIt = blocks.end(); blockIt != blocks.begin();) {
      auto b = *--blockIt;
      auto& block = unit.blocks[b];
      live.reset();
      for (auto s : succs(block)) {
        if (!livein[s].empty()) {
          live |= livein[s];
        }
      }
      for (auto i = block.code.end(); i != block.code.begin();) {
        auto& inst = *--i;
        auto useful = effectful(inst);
        visitDefs(unit, inst, [&](Vreg r) {
          if (r.isPhys() || live.test(r)) {
            useful = true;
            live.reset(r);
          }
        });
        if (useful) {
          visitUses(unit, inst, [&](Vreg r) {
            live.set(r);
          });
        } else if (mutate) {
          inst = nop{};
          changed = true;
        }
      }
      if (mutate) {
        assert(live == livein[b]);
      } else {
        if (live != livein[b]) {
          livein[b] = live;
          changed = true;
        }
      }
    }
    return changed;
  };

  // analyze until livein reaches a fixed point
  while (pass(false)) {}

  // nop-out useless instructions
  if (pass(true)) {
    for (auto b : blocks) {
      auto& code = unit.blocks[b].code;
      auto end = std::remove_if(code.begin(), code.end(), [&](Vinstr& inst) {
        return inst.op == Vinstr::nop;
      });
      code.erase(end, code.end());
    }
    printUnit(kVasmDCELevel, "after vasm-dead", unit);
  }
}
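// The comment above mentions a sparse, SSA-based alternative: mark effectful
// instructions, then transitively mark the definers of their inputs via a
// worklist. A rough standalone sketch of that idea over a toy instruction
// form follows; Inst, defOf, and sparseDCE are illustrative assumptions, not
// the real Vinstr/Vunit API.
#include <cstdint>
#include <queue>
#include <unordered_map>
#include <vector>

struct Inst {
  bool effectful;              // mutates memory, physical regs, or flags
  std::vector<uint32_t> uses;  // vreg numbers read
  std::vector<uint32_t> defs;  // vreg numbers written
  bool live = false;
};

void sparseDCE(std::vector<Inst>& insts) {
  // Map each vreg to the instruction defining it (well defined because of SSA).
  std::unordered_map<uint32_t, size_t> defOf;
  for (size_t i = 0; i < insts.size(); ++i) {
    for (auto d : insts[i].defs) defOf[d] = i;
  }
  // Seed the worklist with effectful instructions, then propagate liveness
  // backwards through use-def edges.
  std::queue<size_t> work;
  for (size_t i = 0; i < insts.size(); ++i) {
    if (insts[i].effectful) { insts[i].live = true; work.push(i); }
  }
  while (!work.empty()) {
    auto i = work.front(); work.pop();
    for (auto u : insts[i].uses) {
      auto it = defOf.find(u);
      if (it != defOf.end() && !insts[it->second].live) {
        insts[it->second].live = true;
        work.push(it->second);
      }
    }
  }
  // Anything still unmarked is dead and can be dropped.
  std::vector<Inst> kept;
  for (auto& in : insts) if (in.live) kept.push_back(in);
  insts.swap(kept);
}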
/*
 * Records any type/reffiness predictions we depend on in the region. Guards
 * for locals and stack cells that are not used will be eliminated by the call
 * to relaxGuards.
 */
void RegionFormer::recordDependencies() {
  // Record the incrementally constructed reffiness predictions.
  assertx(!m_region->empty());
  auto& frontBlock = *m_region->blocks().front();
  for (auto const& dep : m_refDeps.m_arMap) {
    frontBlock.addReffinessPred(m_startSk, {dep.second.m_mask,
                                            dep.second.m_vals,
                                            dep.first});
  }

  // Relax guards and record the ones that survived.
  auto& firstBlock = *m_region->blocks().front();
  auto blockStart = firstBlock.start();
  auto& unit = m_irgs.unit;
  auto const doRelax = RuntimeOption::EvalHHIRRelaxGuards;
  bool changed = false;
  if (doRelax) {
    Timer _t(Timer::selectTracelet_relaxGuards);
    // The IR is going to be discarded immediately, so skip reflowing
    // the types in relaxGuards to save JIT time.
    RelaxGuardsFlags flags = m_profiling ? RelaxSimple : RelaxNormal;
    changed = relaxGuards(unit, *m_irgs.irb->guards(), flags);
  }

  auto guardMap = std::map<RegionDesc::Location,Type>{};
  ITRACE(2, "Visiting guards\n");
  visitGuards(unit, [&](const RegionDesc::Location& loc, Type type) {
    Trace::Indent indent;
    ITRACE(3, "{}: {}\n", show(loc), type);
    if (type <= TCls) return;
    auto inret = guardMap.insert(std::make_pair(loc, type));
    if (inret.second) return;
    auto& oldTy = inret.first->second;
    if (oldTy == TGen) {
      // This is the case that we see an inner type prediction for a GuardLoc
      // that got relaxed to Gen.
      return;
    }
    oldTy &= type;
  });

  for (auto& kv : guardMap) {
    if (kv.second == TGen) {
      // Guard was relaxed to Gen---don't record it.
      continue;
    }
    auto const preCond = RegionDesc::TypedLocation { kv.first, kv.second };
    ITRACE(1, "selectTracelet adding guard {}\n", show(preCond));
    firstBlock.addPreCondition(blockStart, preCond);
  }

  if (changed) {
    printUnit(3, unit, " after guard relaxation ", nullptr,
              m_irgs.irb->guards());
  }
}
/*
 * optimizeExits does two conversions to eliminate common branch-to-exit flows.
 *
 * 1. If we see a jcc that leads to two "identical" blocks ending with
 * bindjmp, then copy the identical part of the targets before the jcc,
 * and replace the jcc with a bindjcc1st instruction using the bytecode
 * destinations from the two original bindjmps. For the sake of this pass,
 * "identical" means matching lea & syncvmsp instructions, and both bindjmp's
 * are for the same function.
 *
 * This leads to more efficient code because the service request stubs will
 * patch jumps in the main trace instead of off-trace.
 *
 * 2. Otherwise, if we see a jcc but only one of the branches is
 * a normal exit, then convert the jcc to a bindexit with the jcc's condition
 * and the original bindjmp's dest.
 */
void optimizeExits(Vunit& unit) {
  auto const pred_counts = count_predecessors(unit);

  PostorderWalker{unit}.dfs([&](Vlabel b) {
    auto& code = unit.blocks[b].code;
    assertx(!code.empty());
    if (code.back().op != Vinstr::jcc) return;

    auto const ijcc = code.back().jcc_;
    auto const t0 = ijcc.targets[0];
    auto const t1 = ijcc.targets[1];
    if (t0 == t1) {
      code.back() = jmp{t0};
      return;
    }
    if (pred_counts[t0] != 1 || pred_counts[t1] != 1) return;

    // copy all but the last instruction in blocks[t] to just before
    // the last instruction in code.
    auto hoist_sync = [&](Vlabel t) {
      const auto& tcode = unit.blocks[t].code;
      code.insert(std::prev(code.end()),
                  tcode.begin(), std::prev(tcode.end()));
    };

    if (match_bindjcc1st(unit, t0, t1)) {
      // hoist the sync instructions from t0 to before the jcc,
      // and replace the jcc with bindjcc1st.
      const auto& bj0 = unit.blocks[t0].code.back().bindjmp_;
      const auto& bj1 = unit.blocks[t1].code.back().bindjmp_;
      hoist_sync(t0);
      code.back() = bindjcc1st{ijcc.cc, ijcc.sf, {bj0.target, bj1.target},
                               bj0.args | bj1.args};
      return;
    }

    auto fold_exit = [&](ConditionCode cc, Vlabel exit, Vlabel next) {
      const auto& bj = unit.blocks[exit].code.back().bindjmp_;
      auto origin = code.back().origin;
      hoist_sync(exit);
      code.back() = bindjcc{cc, ijcc.sf, bj.target, bj.trflags, bj.args};
      code.emplace_back(jmp{next});
      code.back().origin = origin;
    };

    // Try to replace jcc to normal exit with bindexit followed by jmp,
    // as long as the sp adjustment is harmless to hoist (disp==0)
    Vptr sp;
    if (match_bindjmp(unit, t1, &sp) && sp == sp.base[0]) {
      fold_exit(ijcc.cc, t1, t0);
    } else if (match_bindjmp(unit, t0, &sp) && sp == sp.base[0]) {
      fold_exit(ccNegate(ijcc.cc), t0, t1);
    }
  });
  printUnit(kVasmExitsLevel, "after vasm-exits", unit);
}
void optimize(IRUnit& unit, IRBuilder& irBuilder, TransKind kind) {
  Timer timer(Timer::optimize);

  assertx(checkEverything(unit));

  fullDCE(unit);
  printUnit(6, unit, " after initial DCE ");
  assertx(checkEverything(unit));

  if (RuntimeOption::EvalHHIRTypeCheckHoisting) {
    doPass(unit, hoistTypeChecks, DCE::Minimal);
  }

  if (RuntimeOption::EvalHHIRPredictionOpts) {
    doPass(unit, optimizePredictions, DCE::None);
  }

  if (RuntimeOption::EvalHHIRSimplification) {
    doPass(unit, simplifyPass, DCE::Full);
    doPass(unit, cleanCfg, DCE::None);
  }

  if (RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(unit, gvn, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeLoads, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeStores, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRRefcountOpts) {
    doPass(unit, optimizeRefcounts2, DCE::Full);
  }

  if (RuntimeOption::EvalHHIRLICM && RuntimeOption::EvalJitLoops &&
      cfgHasLoop(unit) && kind != TransKind::Profile) {
    doPass(unit, optimizeLoopInvariantCode, DCE::Minimal);
  }

  doPass(unit, removeExitPlaceholders, DCE::Full);

  if (RuntimeOption::EvalHHIRGenerateAsserts) {
    doPass(unit, insertAsserts, DCE::None);
  }

  // Perform final cleanup passes to collapse any critical edges that were
  // split, and simplify our instructions before shipping off to codegen.
  doPass(unit, cleanCfg, DCE::None);
  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRSimplification) {
    doPass(unit, simplifyPass, DCE::Full);
  }
}
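// The three-argument doPass(unit, pass, DCE::...) used throughout this
// pipeline is not shown in these snippets. A plausible shape for it, inferred
// only from how it is called above, is a thin wrapper that runs the pass,
// runs DCE at the requested strength, and re-verifies the unit. Everything
// below (the name doPassSketch, the trace level, the local DCE enum) is an
// assumption for illustration, not the actual HHVM definition.
enum class DCE { None, Minimal, Full };  // presumably already defined for real

template <typename Pass>
void doPassSketch(IRUnit& unit, Pass pass, DCE dce) {
  pass(unit);                                      // run the optimization itself
  switch (dce) {
    case DCE::None:    break;
    case DCE::Minimal: mandatoryDCE(unit); break;  // only required cleanup
    case DCE::Full:    fullDCE(unit); break;       // full liveness-based DCE
  }
  printUnit(6, unit, " after pass ");              // trace at a debug level
  assertx(checkEverything(unit));                  // keep the unit verifiable
}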
// Remove dead instructions by doing a traditional liveness analysis.
// instructions that mutate memory, physical registers, or status flags
// are considered useful. All branches are considered useful.
//
// Given SSA, there's a faster sparse version of this algorithm that marks
// useful instructions in one pass, then transitively marks pure instructions
// that define inputs to useful instructions. However it requires a mapping
// from vreg numbers to the instruction that defines them, and a way to address
// individual instructions.
//
// We could remove useless branches by computing the post-dominator tree and
// RDF(b) for each block; then a branch is only useful if it controls whether
// or not a useful block executes, and useless branches can be forwarded to
// the nearest useful post-dominator.
void removeDeadCode(Vunit& unit) {
  Timer timer(Timer::vasm_dce);
  auto blocks = sortBlocks(unit);
  jit::vector<LiveSet> livein(unit.blocks.size());
  LiveSet live(unit.next_vr);

  auto pass = [&](bool mutate) {
    bool changed = false;
    for (auto blockIt = blocks.end(); blockIt != blocks.begin();) {
      auto b = *--blockIt;
      auto& block = unit.blocks[b];
      live.reset();
      for (auto s : succs(block)) {
        if (!livein[s].empty()) {
          live |= livein[s];
        }
      }
      for (auto i = block.code.end(); i != block.code.begin();) {
        auto& inst = *--i;
        auto useful = effectful(inst);
        visitDefs(unit, inst, [&](Vreg r) {
          if (r.isPhys() || live.test(r)) {
            useful = true;
            live.reset(r);
          }
        });
        if (useful) {
          visitUses(unit, inst, [&](Vreg r) {
            live.set(r);
          });
        } else if (mutate) {
          inst = nop{};
          changed = true;
        }
      }
      if (mutate) {
        assertx(live == livein[b]);
      } else {
        if (live != livein[b]) {
          livein[b] = live;
          changed = true;
        }
      }
    }
    return changed;
  };

  // analyze until livein reaches a fixed point
  while (pass(false)) {}

  auto const changed = pass(true);
  removeTrivialNops(unit);
  if (changed) {
    printUnit(kVasmDCELevel, "after vasm-dead", unit);
  }
}
void genCode(CodeBlock& main, CodeBlock& stubs, IRUnit& unit,
             std::vector<TransBCMapping>* bcMap,
             JIT::MCGenerator* mcg,
             const RegAllocInfo& regs) {
  Timer _t(Timer::codeGen);

  if (dumpIREnabled()) {
    AsmInfo ai(unit);
    genCodeImpl(main, stubs, unit, bcMap, mcg, regs, &ai);
    printUnit(kCodeGenLevel, unit, " after code gen ", &regs, &ai);
  } else {
    genCodeImpl(main, stubs, unit, bcMap, mcg, regs, nullptr);
  }
}
/*
 * Branch fusion:
 * Analyze blocks one at a time, looking for the sequence:
 *
 *   setcc cc, f1 => b
 *   ...
 *   testb b, b => f2
 *   ...
 *   jcc E|NE, f2
 *
 * If found, and f2 is only used by the jcc, then change the code to:
 *
 *   setcc cc, f1 => b
 *   ...
 *   nop
 *   ...
 *   jcc !cc|cc, f1
 *
 * Later, vasm-dead will clean up the nop, and the setcc if b became dead.
 *
 * During the search, any other instruction that has a status flag result
 * will reset the pattern matcher. No instruction can "kill" flags,
 * since flags are SSA variables. However the transformation we want to
 * make extends the setcc flags lifetime, and we don't want it to overlap
 * another flag's lifetime.
 */
void fuseBranches(Vunit& unit) {
  auto blocks = sortBlocks(unit);
  jit::vector<unsigned> uses(unit.next_vr);
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      visitUses(unit, inst, [&](Vreg r) { uses[r]++; });
    }
  }
  bool should_print = false;
  for (auto b : blocks) {
    auto& code = unit.blocks[b].code;
    ConditionCode cc;
    Vreg setcc_flags, setcc_dest, testb_flags;
    unsigned testb_index;
    for (unsigned i = 0, n = code.size(); i < n; ++i) {
      if (code[i].op == Vinstr::setcc) {
        cc = code[i].setcc_.cc;
        setcc_flags = code[i].setcc_.sf;
        setcc_dest = code[i].setcc_.d;
        continue;
      }
      if (setcc_flags.isValid() &&
          match_testb(code[i], setcc_dest) &&
          uses[code[i].testb_.sf] == 1) {
        testb_flags = code[i].testb_.sf;
        testb_index = i;
        continue;
      }
      if (match_jcc(code[i], testb_flags)) {
        code[testb_index] = nop{}; // erase the testb
        auto& jcc = code[i].jcc_;
        jcc.cc = jcc.cc == CC_NE ? cc : ccNegate(cc);
        jcc.sf = setcc_flags;
        should_print = true;
        continue;
      }
      if (setcc_flags.isValid() && sets_flags(code[i])) {
        setcc_flags = testb_flags = Vreg{};
      }
    }
  }
  if (should_print) {
    printUnit(kVasmFusionLevel, "after vasm-fusion", unit);
  }
}
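// Why the rewrite maps jcc NE to cc and jcc E to !cc: `testb b, b` sets ZF
// from the setcc result, so "jcc NE" is taken exactly when the original
// condition held. A tiny standalone model of that mapping (plain C++, not
// vasm; the variable names are illustrative only):
#include <cassert>

int main() {
  for (int condHeld = 0; condHeld <= 1; ++condHeld) {
    bool b = condHeld;            // setcc cc => b
    bool zf = (b & b) == 0;       // testb b, b sets ZF when the result is zero
    bool takenNE = !zf;           // jcc NE: branch if ZF == 0
    bool takenE  = zf;            // jcc E:  branch if ZF == 1
    assert(takenNE == static_cast<bool>(condHeld));   // NE behaves like cc
    assert(takenE  == !static_cast<bool>(condHeld));  // E behaves like !cc
  }
  return 0;
}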
TCA genFuncPrologue(TransID transID, TransKind kind, Func* func, int argc,
                    CodeCache::View code, CGMeta& fixups) {
  auto context = prologue_context(transID, kind, func,
                                  func->getEntryForNumArgs(argc));
  IRUnit unit{context};
  irgen::IRGS env{unit, nullptr};

  irgen::emitFuncPrologue(env, argc, transID);
  irgen::sealUnit(env);
  printUnit(2, unit, "After initial prologue generation");

  auto vunit = irlower::lowerUnit(env.unit, CodeKind::CrossTrace);
  emitVunit(*vunit, env.unit, code, fixups);
  return unit.prologueStart;
}
void SenMLPack::fieldsToJson() {
  int bnLength = this->_bn.length();
  if(bnLength > 0){
    printText("\"bn\":\"", 6);
    printText(this->_bn.c_str(), bnLength);
    printText("\"", 1);
  }
  if(this->_bu){
    printText(",\"bu\":\"", 7);
    printUnit(this->_bu);
    printText("\"", 1);
  }
  if(!isnan(this->_bt)){
    printText(",\"bt\":", 6);
    printDouble(this->_bt, SENML_MAX_DOUBLE_PRECISION);
  }
}
void foldImms(Vunit& unit) {
  assertx(check(unit)); // especially, SSA
  // block order doesn't matter, but only visit reachable blocks.
  auto blocks = sortBlocks(unit);

  // Usage flag for each register. If an SR is used then certain
  // optimizations will not fire since they do not set the condition
  // codes as the original instruction(s) would.
  jit::vector<bool> used(unit.next_vr);
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      visitUses(unit, inst, [&](Vreg r) { used[r] = true; });
    }
  }

  Folder folder(std::move(used));
  folder.vals.resize(unit.next_vr);
  folder.valid.resize(unit.next_vr);
  // figure out which Vregs are constants and stash their values.
  for (auto& entry : unit.constToReg) {
    folder.valid.set(entry.second);
    folder.vals[entry.second] = entry.first.val;
  }
  // now mutate instructions
  for (auto b : blocks) {
    for (auto& inst : unit.blocks[b].code) {
      switch (inst.op) {
#define O(name, imms, uses, defs)          \
        case Vinstr::name: {               \
          auto origin = inst.origin;       \
          folder.fold(inst.name##_, inst); \
          inst.origin = origin;            \
          break;                           \
        }
        VASM_OPCODES
#undef O
      }
    }
  }
  printUnit(kVasmImmsLevel, "after foldImms", unit);
}
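// foldImms rewrites instructions whose sources are known constants into
// immediate forms (e.g. an add of a constant-defined vreg becomes an
// add-immediate). A toy standalone model of the idea, with made-up structs
// rather than the real Vinstr/Folder types:
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

struct ToyAdd {
  int src0, src1, dst;                 // vreg numbers
  std::optional<int64_t> imm;          // set when folded to add-immediate
};

void foldToyAdds(std::vector<ToyAdd>& code,
                 const std::vector<std::optional<int64_t>>& constVal) {
  for (auto& add : code) {
    // If one source is a known constant, fold it into an immediate operand.
    if (constVal[add.src0]) {
      add.imm = *constVal[add.src0];
      add.src0 = add.src1;             // keep the remaining register source
    } else if (constVal[add.src1]) {
      add.imm = *constVal[add.src1];
    }
  }
}

int main() {
  std::vector<std::optional<int64_t>> constVal(3);
  constVal[0] = 8;                     // vreg 0 was defined by a load-immediate of 8
  std::vector<ToyAdd> code{{0, 1, 2, std::nullopt}};
  foldToyAdds(code, constVal);
  assert(code[0].imm && *code[0].imm == 8 && code[0].src0 == 1);
  return 0;
}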
TCA genFuncPrologue(TransID transID, TransKind kind, Func* func, int argc,
                    CodeCache::View code, CGMeta& fixups) {
  auto context = prologue_context(transID, kind, func,
                                  func->getEntryForNumArgs(argc));
  IRUnit unit{context};
  irgen::IRGS env{unit};

  auto& cb = code.main();

  // Dump the func guard in the TC before anything else.
  emitFuncGuard(func, cb, fixups);
  auto const start = cb.frontier();

  irgen::emitFuncPrologue(env, argc, transID);
  irgen::sealUnit(env);
  printUnit(2, unit, "After initial prologue generation");

  auto vunit = irlower::lowerUnit(env.unit, CodeKind::CrossTrace);
  emitVunit(*vunit, env.unit, code, fixups);
  return start;
}
void optimizeJmps(Vunit& unit) {
  auto isEmpty = [&](Vlabel b, Vinstr::Opcode op) {
    auto& code = unit.blocks[b].code;
    return code.size() == 1 && op == code[0].op;
  };
  bool changed = false;
  bool ever_changed = false;
  // The number of incoming edges from (reachable) predecessors for each block.
  // It is maintained as an upper bound of the actual value during the
  // transformation.
  jit::vector<int> npreds(unit.blocks.size(), 0);
  do {
    if (changed) {
      std::fill(begin(npreds), end(npreds), 0);
    }
    changed = false;
    PostorderWalker{unit}.dfs([&](Vlabel b) {
      for (auto s : succs(unit.blocks[b])) {
        npreds[s]++;
      }
    });
    // give entry an extra predecessor to prevent cloning it.
    npreds[unit.entry]++;
    PostorderWalker{unit}.dfs([&](Vlabel b) {
      auto& block = unit.blocks[b];
      auto& code = block.code;
      assertx(!code.empty());
      if (code.back().op == Vinstr::jcc) {
        auto ss = succs(block);
        if (ss[0] == ss[1]) {
          // both edges have same target, change to jmp
          code.back() = jmp{ss[0]};
          --npreds[ss[0]];
          changed = true;
        } else {
          auto jcc_i = code.back().jcc_;
          if (isEmpty(jcc_i.targets[0], Vinstr::fallback)) {
            jcc_i = jcc{ccNegate(jcc_i.cc), jcc_i.sf,
                        {jcc_i.targets[1], jcc_i.targets[0]}};
          }
          if (isEmpty(jcc_i.targets[1], Vinstr::fallback)) {
            // replace jcc with fallbackcc and jmp
            const auto& fb_i = unit.blocks[jcc_i.targets[1]].code[0].fallback_;
            const auto t0 = jcc_i.targets[0];
            const auto jcc_origin = code.back().origin;
            code.pop_back();
            code.emplace_back(
              fallbackcc{jcc_i.cc, jcc_i.sf, fb_i.dest, fb_i.trflags});
            code.back().origin = jcc_origin;
            code.emplace_back(jmp{t0});
            code.back().origin = jcc_origin;
            changed = true;
          }
        }
      }
      for (auto& s : succs(block)) {
        if (isEmpty(s, Vinstr::jmp)) {
          // skip over s
          --npreds[s];
          s = unit.blocks[s].code.back().jmp_.target;
          ++npreds[s];
          changed = true;
        }
      }
      if (code.back().op == Vinstr::jmp) {
        auto s = code.back().jmp_.target;
        if (npreds[s] == 1 || isEmpty(s, Vinstr::jcc)) {
          // overwrite jmp with copy of s
          auto& code2 = unit.blocks[s].code;
          code.pop_back();
          code.insert(code.end(), code2.begin(), code2.end());
          if (--npreds[s]) {
            for (auto ss : succs(block)) {
              ++npreds[ss];
            }
          }
          changed = true;
        }
      }
    });
    ever_changed |= changed;
  } while (changed);
  if (ever_changed) {
    printUnit(kVasmJumpsLevel, "after vasm-jumps", unit);
  }
}
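// The "skip over s" step above is plain jump threading: when a successor
// block contains nothing but `jmp t`, retarget the edge to t and keep the
// predecessor counts consistent. A standalone sketch of just that step on a
// toy CFG (the target/empty/npreds encoding is an illustrative assumption,
// and it assumes empty blocks don't form a cycle):
#include <cassert>
#include <vector>

void threadJumps(std::vector<int>& target, const std::vector<bool>& empty,
                 std::vector<int>& npreds) {
  for (size_t b = 0; b < target.size(); ++b) {
    if (target[b] < 0) continue;       // block doesn't end in a jmp
    int s = target[b];
    // While the successor contains only `jmp target[s]`, retarget past it.
    while (target[s] >= 0 && empty[s]) {
      --npreds[s];
      s = target[s];
      ++npreds[s];
    }
    target[b] = s;
  }
}

int main() {
  // Block 0: real work then jmp to 1; block 1: only `jmp 2`; block 2: exit.
  std::vector<int> target{1, 2, -1};
  std::vector<bool> empty{false, true, false};
  std::vector<int> npreds{0, 1, 1};
  threadJumps(target, empty, npreds);
  assert(target[0] == 2 && npreds[1] == 0 && npreds[2] == 2);
  return 0;
}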
void optimize(IRUnit& unit, IRBuilder& irBuilder, TransKind kind) {
  Timer _t(Timer::optimize);

  auto finishPass = [&](const char* msg) {
    if (msg) {
      printUnit(6, unit, folly::format("after {}", msg).str().c_str());
    }
    assert(checkCfg(unit));
    assert(checkTmpsSpanningCalls(unit));
    if (debug) {
      forEachInst(rpoSortCfg(unit), [&](IRInstruction* inst) {
        assert(checkOperandTypes(inst, &unit));
      });
    }
  };

  auto doPass = [&](void (*fn)(IRUnit&), const char* msg = nullptr) {
    fn(unit);
    finishPass(msg);
  };

  auto dce = [&](const char* which) {
    if (!RuntimeOption::EvalHHIRDeadCodeElim) return;
    eliminateDeadCode(unit);
    finishPass(folly::format("{} DCE", which).str().c_str());
  };

  auto const doReoptimize = RuntimeOption::EvalHHIRExtraOptPass &&
    (RuntimeOption::EvalHHIRCse || RuntimeOption::EvalHHIRSimplification);

  auto const hasLoop = RuntimeOption::EvalJitLoops && cfgHasLoop(unit);

  // TODO(#5792564): Guard relaxation doesn't work with loops.
  if (shouldHHIRRelaxGuards() && !hasLoop) {
    Timer _t(Timer::optimize_relaxGuards);
    const bool simple = kind == TransKind::Profile &&
                        (RuntimeOption::EvalJitRegionSelector == "tracelet" ||
                         RuntimeOption::EvalJitRegionSelector == "method");
    RelaxGuardsFlags flags = (RelaxGuardsFlags)
      (RelaxReflow | (simple ? RelaxSimple : RelaxNormal));
    auto changed = relaxGuards(unit, *irBuilder.guards(), flags);
    if (changed) finishPass("guard relaxation");

    if (doReoptimize) {
      irBuilder.reoptimize();
      finishPass("guard relaxation reoptimize");
    }
  }

  if (RuntimeOption::EvalHHIRRefcountOpts) {
    optimizeRefcounts(unit, FrameStateMgr{unit.entry()->front().marker()});
    finishPass("refcount opts");
  }

  dce("initial");

  if (RuntimeOption::EvalHHIRPredictionOpts) {
    doPass(optimizePredictions, "prediction opts");
  }

  if (doReoptimize) {
    irBuilder.reoptimize();
    finishPass("reoptimize");
    dce("reoptimize");
  }

  if (RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(gvn);
    dce("gvn");
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(optimizeLoads);
    dce("loadelim");
  }

  /*
   * Note: doing this pass this late might not be ideal, in particular because
   * we've already turned some StLoc instructions into StLocNT.
   *
   * But right now there are assumptions preventing us from doing it before
   * refcount opts. (Refcount opts needs to see all the StLocs explicitly
   * because it makes assumptions about whether references are consumed based
   * on that.)
   */
  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(optimizeStores);
    dce("storeelim");
  }

  if (RuntimeOption::EvalHHIRGenerateAsserts) {
    doPass(insertAsserts, "RefCnt asserts");
  }
}
void BackEnd::genCodeImpl(IRUnit& unit, AsmInfo* asmInfo) {
  ctr++;
  auto regs = allocateRegs(unit);
  assert(checkRegisters(unit, regs)); // calls checkCfg internally.
  Timer _t(Timer::codeGen);
  LiveRegs live_regs = computeLiveRegs(unit, regs);
  CodegenState state(unit, regs, live_regs, asmInfo);

  CodeBlock& mainCodeIn = mcg->code.main();
  CodeBlock& coldCodeIn = mcg->code.cold();
  CodeBlock* frozenCode = &mcg->code.frozen();

  CodeBlock mainCode;
  CodeBlock coldCode;
  bool relocate = false;
  if (RuntimeOption::EvalJitRelocationSize &&
      supportsRelocation() &&
      coldCodeIn.canEmit(RuntimeOption::EvalJitRelocationSize * 3)) {
    /*
     * This is mainly to exercise the relocator, and ensure that it's
     * not broken by new non-relocatable code. Later, it will be
     * used to do some peephole optimizations, such as reducing branch
     * sizes.
     * Allocate enough space that the relocated cold code doesn't
     * overlap the emitted cold code.
     */
    static unsigned seed = 42;
    auto off = rand_r(&seed) & (cacheLineSize() - 1);
    coldCode.init(coldCodeIn.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocCold");

    mainCode.init(coldCode.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocMain");

    relocate = true;
  } else {
    /*
     * Use separate code blocks, so that attempts to use the mcg's
     * code blocks directly will fail (eg by overwriting the same
     * memory being written through these locals).
     */
    coldCode.init(coldCodeIn.frontier(), coldCodeIn.available(),
                  coldCodeIn.name().c_str());
    mainCode.init(mainCodeIn.frontier(), mainCodeIn.available(),
                  mainCodeIn.name().c_str());
  }

  if (frozenCode == &coldCodeIn) {
    frozenCode = &coldCode;
  }
  auto frozenStart = frozenCode->frontier();
  auto coldStart DEBUG_ONLY = coldCodeIn.frontier();
  auto mainStart DEBUG_ONLY = mainCodeIn.frontier();
  size_t hhir_count{0};
  {
    mcg->code.lock();
    mcg->cgFixups().setBlocks(&mainCode, &coldCode, frozenCode);

    SCOPE_EXIT {
      mcg->cgFixups().setBlocks(nullptr, nullptr, nullptr);
      mcg->code.unlock();
    };

    if (RuntimeOption::EvalHHIRGenerateAsserts) {
      emitTraceCall(mainCode, unit.bcOff());
    }

    auto const linfo = layoutBlocks(unit);
    auto main_start = mainCode.frontier();
    auto cold_start = coldCode.frontier();
    auto frozen_start = frozenCode->frontier();
    Vasm vasm(&state.meta);
    auto& vunit = vasm.unit();
    // create the initial set of vasm blocks, numbered the same as hhir blocks.
    for (uint32_t i = 0, n = unit.numBlocks(); i < n; ++i) {
      state.labels[i] = vunit.makeBlock(AreaIndex::Main);
    }
    vunit.roots.push_back(state.labels[unit.entry()]);
    vasm.main(mainCode);
    vasm.cold(coldCode);
    vasm.frozen(*frozenCode);

    for (auto it = linfo.blocks.begin(); it != linfo.blocks.end(); ++it) {
      auto block = *it;
      auto v = block->hint() == Block::Hint::Unlikely ? vasm.cold() :
               block->hint() == Block::Hint::Unused ? vasm.frozen() :
               vasm.main();
      FTRACE(6, "genBlock {} on {}\n", block->id(),
             area_names[(unsigned)v.area()]);
      auto b = state.labels[block];
      vunit.blocks[b].area = v.area();
      v.use(b);
      hhir_count += genBlock(unit, v, vasm, state, block);
      assert(v.closed());
      assert(vasm.main().empty() || vasm.main().closed());
      assert(vasm.cold().empty() || vasm.cold().closed());
      assert(vasm.frozen().empty() || vasm.frozen().closed());
    }
    printUnit("after code-gen", vasm.unit());
    vasm.finish(vasm_abi);
    if (state.asmInfo) {
      auto block = unit.entry();
      state.asmInfo->asmRanges[block] = {main_start, mainCode.frontier()};
      if (mainCode.base() != coldCode.base() && frozenCode != &coldCode) {
        state.asmInfo->acoldRanges[block] = {cold_start, coldCode.frontier()};
      }
      if (mainCode.base() != frozenCode->base()) {
        state.asmInfo->afrozenRanges[block] = {frozen_start,
                                               frozenCode->frontier()};
      }
    }
  }

  auto bcMap = &mcg->cgFixups().m_bcMap;
  if (!bcMap->empty()) {
    TRACE(1, "BCMAPS before relocation\n");
    for (UNUSED auto& map : *bcMap) {
      TRACE(1, "%s %-6d %p %p %p\n", map.md5.toString().c_str(),
            map.bcStart, map.aStart, map.acoldStart, map.afrozenStart);
    }
  }

  assert(coldCodeIn.frontier() == coldStart);
  assert(mainCodeIn.frontier() == mainStart);

  if (relocate) {
    if (asmInfo) {
      printUnit(kRelocationLevel, unit, " before relocation ", &regs, asmInfo);
    }

    auto& be = mcg->backEnd();
    RelocationInfo rel;
    size_t asm_count{0};
    asm_count += be.relocate(rel, mainCodeIn,
                             mainCode.base(), mainCode.frontier(),
                             mcg->cgFixups());

    asm_count += be.relocate(rel, coldCodeIn,
                             coldCode.base(), coldCode.frontier(),
                             mcg->cgFixups());
    TRACE(1, "hhir-inst-count %ld asm %ld\n", hhir_count, asm_count);

    if (frozenCode != &coldCode) {
      rel.recordRange(frozenStart, frozenCode->frontier(),
                      frozenStart, frozenCode->frontier());
    }
    be.adjustForRelocation(rel, mcg->cgFixups());
    be.adjustForRelocation(rel, asmInfo, mcg->cgFixups());

    if (asmInfo) {
      static int64_t mainDeltaTot = 0, coldDeltaTot = 0;
      int64_t mainDelta =
        (mainCodeIn.frontier() - mainStart) -
        (mainCode.frontier() - mainCode.base());
      int64_t coldDelta =
        (coldCodeIn.frontier() - coldStart) -
        (coldCode.frontier() - coldCode.base());

      mainDeltaTot += mainDelta;
      HPHP::Trace::traceRelease("main delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                mainDelta, mainDeltaTot);
      coldDeltaTot += coldDelta;
      HPHP::Trace::traceRelease("cold delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                coldDelta, coldDeltaTot);
    }
#ifndef NDEBUG
    auto& ip = mcg->cgFixups().m_inProgressTailJumps;
    for (size_t i = 0; i < ip.size(); ++i) {
      const auto& ib = ip[i];
      assert(!mainCode.contains(ib.toSmash()));
      assert(!coldCode.contains(ib.toSmash()));
    }
    memset(mainCode.base(), 0xcc, mainCode.frontier() - mainCode.base());
    memset(coldCode.base(), 0xcc, coldCode.frontier() - coldCode.base());
#endif
  } else {
    coldCodeIn.skip(coldCode.frontier() - coldCodeIn.frontier());
    mainCodeIn.skip(mainCode.frontier() - mainCodeIn.frontier());
  }

  if (asmInfo) {
    printUnit(kCodeGenLevel, unit, " after code gen ", &regs, asmInfo);
  }
}
void optimize(IRUnit& unit, IRBuilder& irBuilder, TransKind kind) {
  Timer _t(Timer::optimize);

  auto const finishPass = [&] (const char* msg) {
    if (msg) {
      printUnit(6, unit, folly::format("after {}", msg).str().c_str());
    }
    assertx(checkCfg(unit));
    assertx(checkTmpsSpanningCalls(unit));
    if (debug) {
      forEachInst(rpoSortCfg(unit), [&](IRInstruction* inst) {
        assertx(checkOperandTypes(inst, &unit));
      });
    }
  };

  auto const doPass = [&] (void (*fn)(IRUnit&), const char* msg = nullptr) {
    fn(unit);
    finishPass(msg);
  };

  auto const dce = [&] (const char* which) {
    if (!RuntimeOption::EvalHHIRDeadCodeElim) return;
    eliminateDeadCode(unit);
    finishPass(folly::format("{} DCE", which).str().c_str());
  };

  auto const simplifyPass = [] (IRUnit& unit) {
    boost::dynamic_bitset<> reachable(unit.numBlocks());
    reachable.set(unit.entry()->id());

    auto const blocks = rpoSortCfg(unit);

    for (auto block : blocks) {
      // Skip unreachable blocks, or simplify() cries.
      if (!reachable.test(block->id())) continue;

      for (auto& inst : *block) simplify(unit, &inst);

      if (auto const b = block->back().next())  reachable.set(b->id());
      if (auto const b = block->back().taken()) reachable.set(b->id());
    }
  };

  auto const doSimplify = RuntimeOption::EvalHHIRExtraOptPass &&
                          RuntimeOption::EvalHHIRSimplification;
  auto const hasLoop = RuntimeOption::EvalJitLoops && cfgHasLoop(unit);

  auto const traceMode = kind != TransKind::Optimize ||
                         RuntimeOption::EvalJitPGORegionSelector == "hottrace";

  // TODO (#5792564): Guard relaxation doesn't work with loops.
  // TODO (#6599498): Guard relaxation is broken in wholecfg mode.
  if (shouldHHIRRelaxGuards() && !hasLoop && traceMode) {
    Timer _t(Timer::optimize_relaxGuards);
    const bool simple = kind == TransKind::Profile &&
                        (RuntimeOption::EvalJitRegionSelector == "tracelet" ||
                         RuntimeOption::EvalJitRegionSelector == "method");
    RelaxGuardsFlags flags = (RelaxGuardsFlags)
      (RelaxReflow | (simple ? RelaxSimple : RelaxNormal));
    auto changed = relaxGuards(unit, *irBuilder.guards(), flags);
    if (changed) finishPass("guard relaxation");

    if (doSimplify) {
      doPass(simplifyPass, "guard relaxation simplify");
    }
  }

  // This is vestigial (it removes some instructions needed by the old refcount
  // opts pass), and will be removed soon.
  eliminateTakes(unit);

  dce("initial");

  if (RuntimeOption::EvalHHIRPredictionOpts) {
    doPass(optimizePredictions, "prediction opts");
  }

  if (doSimplify) {
    doPass(simplifyPass, "simplify");
    dce("simplify");
  }

  if (RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(gvn);
    dce("gvn");
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(optimizeLoads);
    dce("loadelim");
  }

  /*
   * Note: doing this pass this late might not be ideal, in particular because
   * we've already turned some StLoc instructions into StLocNT.
   *
   * But right now there are assumptions preventing us from doing it before
   * refcount opts. (Refcount opts needs to see all the StLocs explicitly
   * because it makes assumptions about whether references are consumed based
   * on that.)
   */
  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(optimizeStores);
    dce("storeelim");
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRRefcountOpts) {
    doPass(optimizeRefcounts2);
    dce("refcount");
  }

  if (RuntimeOption::EvalHHIRGenerateAsserts) {
    doPass(insertAsserts);
  }
}
void optimize(IRUnit& unit, IRBuilder& irBuilder, TransKind kind) {
  Timer timer(Timer::optimize);

  assertx(checkEverything(unit));

  auto const hasLoop = RuntimeOption::EvalJitLoops && cfgHasLoop(unit);
  auto const func = unit.entry()->front().marker().func();
  auto const regionMode = pgoRegionMode(*func);
  auto const traceMode = kind != TransKind::Optimize ||
                         regionMode == PGORegionMode::Hottrace;

  // TODO (#5792564): Guard relaxation doesn't work with loops.
  // TODO (#6599498): Guard relaxation is broken in wholecfg mode.
  if (shouldHHIRRelaxGuards() && !hasLoop && traceMode) {
    Timer _t(Timer::optimize_relaxGuards);
    const bool simple = kind == TransKind::Profile &&
                        (RuntimeOption::EvalJitRegionSelector == "tracelet" ||
                         RuntimeOption::EvalJitRegionSelector == "method");
    RelaxGuardsFlags flags = (RelaxGuardsFlags)
      (RelaxReflow | (simple ? RelaxSimple : RelaxNormal));
    auto changed = relaxGuards(unit, *irBuilder.guards(), flags);
    if (changed) {
      printUnit(6, unit, "after guard relaxation");
      mandatoryDCE(unit); // relaxGuards can leave unreachable preds.
    }

    if (RuntimeOption::EvalHHIRSimplification) {
      doPass(unit, simplifyPass, DCE::Minimal);
      doPass(unit, cleanCfg, DCE::None);
    }
  }

  fullDCE(unit);
  printUnit(6, unit, " after initial DCE ");
  assertx(checkEverything(unit));

  if (RuntimeOption::EvalHHIRTypeCheckHoisting) {
    doPass(unit, hoistTypeChecks, DCE::None);
  }

  if (RuntimeOption::EvalHHIRPredictionOpts) {
    doPass(unit, optimizePredictions, DCE::None);
  }

  if (RuntimeOption::EvalHHIRSimplification) {
    doPass(unit, simplifyPass, DCE::Full);
    doPass(unit, cleanCfg, DCE::None);
  }

  if (RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(unit, gvn, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeLoads, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeStores, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRRefcountOpts) {
    doPass(unit, optimizeRefcounts2, DCE::Full);
  }

  if (RuntimeOption::EvalHHIRLICM) {
    if (kind != TransKind::Profile && hasLoop) {
      // The clean pass is just to stress lack of pre_headers for now, since
      // LICM is a disabled prototype pass.
      doPass(unit, cleanCfg, DCE::None);
      doPass(unit, optimizeLoopInvariantCode, DCE::Minimal);
    }
  }

  doPass(unit, removeExitPlaceholders, DCE::Full);

  if (RuntimeOption::EvalHHIRGenerateAsserts) {
    doPass(unit, insertAsserts, DCE::None);
  }
}
void optimize(IRUnit& unit, TransKind kind) {
  Timer timer(Timer::optimize, unit.logEntry().get_pointer());

  assertx(checkEverything(unit));

  fullDCE(unit);
  printUnit(6, unit, " after initial DCE ");
  assertx(checkEverything(unit));

  if (RuntimeOption::EvalHHIRPredictionOpts) {
    doPass(unit, optimizePredictions, DCE::None);
  }

  if (RuntimeOption::EvalHHIRSimplification) {
    doPass(unit, simplifyPass, DCE::Full);
    doPass(unit, cleanCfg, DCE::None);
  }

  if (RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(unit, gvn, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeLoads, DCE::Full);
  }

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRMemoryOpts) {
    doPass(unit, optimizeStores, DCE::Full);
  }

  if (RuntimeOption::EvalHHIRPartialInlineFrameOpts) {
    doPass(unit, optimizeInlineReturns, DCE::Full);
  }

  doPass(unit, optimizePhis, DCE::Full);

  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRRefcountOpts) {
    doPass(unit, optimizeRefcounts, DCE::Full);
  }

  if (RuntimeOption::EvalHHIRLICM && cfgHasLoop(unit) &&
      kind != TransKind::Profile) {
    doPass(unit, optimizeLoopInvariantCode, DCE::Minimal);
  }

  doPass(unit, simplifyOrdStrIdx, DCE::Minimal);

  doPass(unit, removeExitPlaceholders, DCE::Full);

  if (RuntimeOption::EvalHHIRGenerateAsserts) {
    doPass(unit, insertAsserts, DCE::None);
  }

  // Perform final cleanup passes to collapse any critical edges that were
  // split, and simplify our instructions before shipping off to codegen.
  doPass(unit, cleanCfg, DCE::None);
  if (kind != TransKind::Profile &&
      RuntimeOption::EvalHHIRGlobalValueNumbering) {
    doPass(unit, gvn, DCE::Full);
  }
  if (kind != TransKind::Profile && RuntimeOption::EvalHHIRSimplification) {
    doPass(unit, simplifyPass, DCE::Full);
  }
  doPass(unit, fixBlockHints, DCE::None);
}
void optimizeJmps(Vunit& unit) {
  auto isEmpty = [&](Vlabel b, Vinstr::Opcode op) {
    auto& code = unit.blocks[b].code;
    return code.size() == 1 && op == code[0].op;
  };
  bool changed = false;
  bool ever_changed = false;
  jit::vector<int> npreds(unit.blocks.size(), 0);
  do {
    if (changed) {
      fill(npreds.begin(), npreds.end(), 0);
    }
    changed = false;
    PostorderWalker{unit}.dfs([&](Vlabel b) {
      for (auto s : succs(unit.blocks[b])) {
        npreds[s]++;
      }
    });
    // give roots an extra predecessor to prevent cloning them.
    for (auto b : unit.roots) {
      npreds[b]++;
    }
    PostorderWalker{unit}.dfs([&](Vlabel b) {
      auto& block = unit.blocks[b];
      auto& code = block.code;
      assert(!code.empty());
      if (code.back().op == Vinstr::jcc) {
        auto ss = succs(block);
        if (ss[0] == ss[1]) {
          // both edges have same target, change to jmp
          code.back() = jmp{ss[0]};
          changed = true;
        }
      }
      if (code.back().op == Vinstr::jmp) {
        auto& s = code.back().jmp_.target;
        if (isEmpty(s, Vinstr::jmp)) {
          // skip over s
          s = unit.blocks[s].code.back().jmp_.target;
          changed = true;
        } else if (npreds[s] == 1 || isEmpty(s, Vinstr::jcc)) {
          // overwrite jmp with copy of s
          auto& code2 = unit.blocks[s].code;
          code.pop_back();
          code.insert(code.end(), code2.begin(), code2.end());
          changed = true;
        }
      } else {
        for (auto& s : succs(block)) {
          if (isEmpty(s, Vinstr::jmp)) {
            // skip over s
            s = unit.blocks[s].code.back().jmp_.target;
            changed = true;
          }
        }
      }
    });
    ever_changed |= changed;
  } while (changed);
  if (ever_changed) {
    printUnit(kVasmJumpsLevel, "after vasm-jumps", unit);
  }
}
static void genCodeImpl(IRUnit& unit, AsmInfo* asmInfo) {
  auto regs = allocateRegs(unit);
  assert(checkRegisters(unit, regs)); // calls checkCfg internally.
  Timer _t(Timer::codeGen);
  LiveRegs live_regs = computeLiveRegs(unit, regs);
  CodegenState state(unit, regs, live_regs, asmInfo);

  // Returns: whether a block has already been emitted.
  DEBUG_ONLY auto isEmitted = [&](Block* block) {
    return state.addresses[block];
  };

  CodeBlock& mainCodeIn = mcg->code.main();
  CodeBlock& coldCodeIn = mcg->code.cold();
  CodeBlock* frozenCode = &mcg->code.frozen();

  CodeBlock mainCode;
  CodeBlock coldCode;
  bool relocate = false;
  if (RuntimeOption::EvalJitRelocationSize &&
      mcg->backEnd().supportsRelocation() &&
      coldCodeIn.canEmit(RuntimeOption::EvalJitRelocationSize * 3)) {
    /*
     * This is mainly to exercise the relocator, and ensure that it's
     * not broken by new non-relocatable code. Later, it will be
     * used to do some peephole optimizations, such as reducing branch
     * sizes.
     * Allocate enough space that the relocated cold code doesn't
     * overlap the emitted cold code.
     */
    static unsigned seed = 42;
    auto off = rand_r(&seed) & (mcg->backEnd().cacheLineSize() - 1);
    coldCode.init(coldCodeIn.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocCold");

    mainCode.init(coldCode.frontier() +
                  RuntimeOption::EvalJitRelocationSize + off,
                  RuntimeOption::EvalJitRelocationSize - off, "cgRelocMain");

    relocate = true;
  } else {
    /*
     * Use separate code blocks, so that attempts to use the mcg's
     * code blocks directly will fail (eg by overwriting the same
     * memory being written through these locals).
     */
    coldCode.init(coldCodeIn.frontier(), coldCodeIn.available(),
                  coldCodeIn.name().c_str());
    mainCode.init(mainCodeIn.frontier(), mainCodeIn.available(),
                  mainCodeIn.name().c_str());
  }

  if (frozenCode == &coldCodeIn) {
    frozenCode = &coldCode;
  }
  auto frozenStart = frozenCode->frontier();
  auto coldStart DEBUG_ONLY = coldCodeIn.frontier();
  auto mainStart DEBUG_ONLY = mainCodeIn.frontier();
  auto bcMap = &mcg->cgFixups().m_bcMap;

  {
    mcg->code.lock();
    mcg->cgFixups().setBlocks(&mainCode, &coldCode, frozenCode);

    SCOPE_EXIT {
      mcg->cgFixups().setBlocks(nullptr, nullptr, nullptr);
      mcg->code.unlock();
    };

    /*
     * Emit the given block on the supplied assembler. The `nextLinear'
     * is the next block that will be emitted on this assembler. If it is
     * not the next block in control flow order, then emit a patchable jump
     * to the next flow block.
     */
    auto emitBlock = [&](CodeBlock& cb, Block* block, Block* nextLinear) {
      assert(!isEmitted(block));

      FTRACE(6, "genBlock {} on {}\n", block->id(),
             cb.base() == coldCode.base() ? "acold" : "a");

      auto const aStart       = cb.frontier();
      auto const acoldStart   = coldCode.frontier();
      auto const afrozenStart = frozenCode->frontier();
      mcg->backEnd().patchJumps(cb, state, block);
      state.addresses[block] = aStart;

      // If the block ends with a Jmp and the next block is going to be
      // its target, we don't need to actually emit it.
      IRInstruction* last = &block->back();
      state.noTerminalJmp = last->op() == Jmp && nextLinear == last->taken();

      if (state.asmInfo) {
        state.asmInfo->asmRanges[block] = TcaRange(aStart, cb.frontier());
      }

      genBlock(unit, cb, coldCode, *frozenCode, state, block, bcMap);
      auto nextFlow = block->next();
      if (nextFlow && nextFlow != nextLinear) {
        mcg->backEnd().emitFwdJmp(cb, nextFlow, state);
      }

      if (state.asmInfo) {
        state.asmInfo->asmRanges[block] = TcaRange(aStart, cb.frontier());
        if (cb.base() != coldCode.base() && frozenCode != &coldCode) {
          state.asmInfo->acoldRanges[block] = TcaRange(acoldStart,
                                                       coldCode.frontier());
        }
        if (cb.base() != frozenCode->base()) {
          state.asmInfo->afrozenRanges[block] = TcaRange(afrozenStart,
                                                         frozenCode->frontier());
        }
      }
    };

    if (RuntimeOption::EvalHHIRGenerateAsserts) {
      mcg->backEnd().emitTraceCall(mainCode, unit.bcOff());
    }

    auto const linfo = layoutBlocks(unit);

    for (auto it = linfo.blocks.begin(); it != linfo.acoldIt; ++it) {
      Block* nextLinear = boost::next(it) != linfo.acoldIt
        ? *boost::next(it) : nullptr;
      emitBlock(mainCode, *it, nextLinear);
    }
    for (auto it = linfo.acoldIt; it != linfo.afrozenIt; ++it) {
      Block* nextLinear = boost::next(it) != linfo.afrozenIt
        ? *boost::next(it) : nullptr;
      emitBlock(coldCode, *it, nextLinear);
    }
    for (auto it = linfo.afrozenIt; it != linfo.blocks.end(); ++it) {
      Block* nextLinear = boost::next(it) != linfo.blocks.end()
        ? *boost::next(it) : nullptr;
      emitBlock(*frozenCode, *it, nextLinear);
    }

    if (debug) {
      for (Block* UNUSED block : linfo.blocks) {
        assert(isEmitted(block));
      }
    }
  }

  assert(coldCodeIn.frontier() == coldStart);
  assert(mainCodeIn.frontier() == mainStart);

  if (relocate) {
    if (asmInfo) {
      printUnit(kRelocationLevel, unit, " before relocation ", &regs, asmInfo);
    }

    auto& be = mcg->backEnd();
    RelocationInfo rel;
    be.relocate(rel, mainCodeIn,
                mainCode.base(), mainCode.frontier(),
                mcg->cgFixups());

    be.relocate(rel, coldCodeIn,
                coldCode.base(), coldCode.frontier(),
                mcg->cgFixups());

    if (frozenCode != &coldCode) {
      rel.recordRange(frozenStart, frozenCode->frontier(),
                      frozenStart, frozenCode->frontier());
    }
    be.adjustForRelocation(rel, mcg->cgFixups());
    be.adjustForRelocation(rel, asmInfo, mcg->cgFixups());

    if (asmInfo) {
      static int64_t mainDeltaTot = 0, coldDeltaTot = 0;
      int64_t mainDelta =
        (mainCodeIn.frontier() - mainStart) -
        (mainCode.frontier() - mainCode.base());
      int64_t coldDelta =
        (coldCodeIn.frontier() - coldStart) -
        (coldCode.frontier() - coldCode.base());

      mainDeltaTot += mainDelta;
      HPHP::Trace::traceRelease("main delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                mainDelta, mainDeltaTot);
      coldDeltaTot += coldDelta;
      HPHP::Trace::traceRelease("cold delta after relocation: "
                                "%" PRId64 " (%" PRId64 ")\n",
                                coldDelta, coldDeltaTot);
    }
#ifndef NDEBUG
    auto& ip = mcg->cgFixups().m_inProgressTailJumps;
    for (size_t i = 0; i < ip.size(); ++i) {
      const auto& ib = ip[i];
      assert(!mainCode.contains(ib.toSmash()));
      assert(!coldCode.contains(ib.toSmash()));
    }
    memset(mainCode.base(), 0xcc, mainCode.frontier() - mainCode.base());
    memset(coldCode.base(), 0xcc, coldCode.frontier() - coldCode.base());
#endif
  } else {
    coldCodeIn.skip(coldCode.frontier() - coldCodeIn.frontier());
    mainCodeIn.skip(mainCode.frontier() - mainCodeIn.frontier());
  }

  if (asmInfo) {
    printUnit(kCodeGenLevel, unit, " after code gen ", &regs, asmInfo);
  }
}