/*
 * Emit the body-dispatch code for `func' and return the address at which it
 * starts in the main code block of `code'.
 *
 * IR is generated from `dvs' via irgen::emitFuncBodyDispatch(), sealed, and
 * lowered as CodeKind::CrossTrace.  When EvalPerfRelocate is set, the emitted
 * range is also recorded in the perf relocation map.  All pending fixups are
 * processed before returning.
 */
TCA genFuncBodyDispatch(Func* func, const DVFuncletsVec& dvs,
                        CodeCache::View code) {
  auto ctx = prologue_context(kInvalidTransID, TransKind::Live,
                              func, func->base());
  IRUnit irUnit{ctx};
  irgen::IRGS irgs{irUnit};

  auto& mainBlock = code.main();
  auto& frozenBlock = code.frozen();

  // The main frontier before anything is emitted is what gets returned.
  auto const entry = mainBlock.frontier();

  irgen::emitFuncBodyDispatch(irgs, dvs);
  irgen::sealUnit(irgs);

  CGMeta meta;
  irlower::genCode(irgs.unit, code, meta, CodeKind::CrossTrace);

  if (RuntimeOption::EvalPerfRelocate) {
    // There are no incoming branches to report; the frozen frontier is passed
    // as both ends of the cold range, i.e. an empty range.
    GrowableVector<IncomingBranch> noBranches;
    recordPerfRelocMap(entry, mainBlock.frontier(),
                       frozenBlock.frontier(), frozenBlock.frontier(),
                       SrcKey { func, dvs[0].second, false },
                       0, noBranches, meta);
  }

  meta.process(nullptr);
  return entry;
}
template<class T> uint64_t test_const(T val) { using testfunc = double (*)(); static const Abi test_abi = { .gpUnreserved = RegSet{}, .gpReserved = x64::abi().gp(), .simdUnreserved = RegSet{xmm0}, .simdReserved = x64::abi().simd() - RegSet{xmm0}, .calleeSaved = x64::abi().calleeSaved, .sf = x64::abi().sf }; static uint8_t code[1000]; // None of these tests should use any data. static uint8_t data_buffer[0]; CodeBlock main; main.init(code, sizeof(code), "test"); DataBlock data; data.init(data_buffer, sizeof(data), "data"); Vunit unit; Vasm vasm{unit}; Vtext text { main, data }; auto& v = vasm.main(); unit.entry = v; v << copy{v.cns(val), Vreg{xmm0}}; v << ret{RegSet{xmm0}}; optimizeX64(vasm.unit(), test_abi, true /* regalloc */); CGMeta fixups; LeaseHolder writer{Translator::WriteLease()}; EXPECT_TRUE(writer.canWrite()); emitX64(unit, text, fixups, nullptr); // The above code might use fixups.literals but shouldn't use anything else. fixups.literals.clear(); EXPECT_TRUE(fixups.empty()); union { double d; uint64_t c; } u; u.d = ((testfunc)code)(); return u.c; }
template<class T> uint64_t test_const(T val) { using testfunc = double (*)(); static const Abi test_abi = { .gpUnreserved = RegSet{}, .gpReserved = x64::abi().gp(), .simdUnreserved = RegSet{xmm0}, .simdReserved = x64::abi().simd() - RegSet{xmm0}, .calleeSaved = x64::abi().calleeSaved, .sf = x64::abi().sf }; static uint8_t code[1000]; CodeBlock main; main.init(code, sizeof(code), "test"); Vasm vasm; Vtext text { main }; auto& unit = vasm.unit(); auto& v = vasm.main(); unit.entry = v; v << copy{v.cns(val), Vreg{xmm0}}; v << ret{RegSet{xmm0}}; optimizeX64(vasm.unit(), test_abi); CGMeta fixups; emitX64(unit, text, fixups, nullptr); // The above code might use fixups.literals but shouldn't use anything else. fixups.literals.clear(); EXPECT_TRUE(fixups.empty()); union { double d; uint64_t c; } u; u.d = ((testfunc)code)(); return u.c; }
/*
 * Generate the prologue of `func' for calls passing `argc' arguments as a
 * translation of kind `kind', install it on the Func, and record it with the
 * translation database and debugging/profiling hooks.
 *
 * Returns the prologue's start address, or nullptr if newTranslation()
 * declines.
 */
static TCA emitFuncPrologueImpl(Func* func, int argc, TransKind kind) {
  if (!newTranslation()) return nullptr;

  // Any argc beyond the non-variadic parameter count maps to the single slot
  // right after the last of them.
  int const numDeclared = func->numNonVariadicParams();
  int const paramIndex = (argc > numDeclared) ? numDeclared + 1 : argc;
  auto const funcBody = SrcKey{func, func->getEntryForNumArgs(argc), false};

  profileSetHotFuncAttr();
  auto codeLock = lockCode();

  auto view = code().view(kind);
  TCA const frontierBeforeAlign = view.main().frontier();
  CGMeta meta;

  // Near the end of a cache line, spend a few bytes so that the func and its
  // body end up spread over fewer lines overall.
  align(view.main(), &meta, Alignment::CacheLineRoundUp, AlignContext::Dead);

  TransLocMaker maker(view);
  maker.markStart();

  // Not necessarily the real entry point: for funcIsMagic prologues this is
  // only one possible prologue.
  TCA const alignedStart = view.main().frontier();

  // Hand the prologue a TransID whenever profiling data is available.
  auto const transID = [&] {
    if (kind != TransKind::ProfPrologue) {
      auto const wantID = profData() && transdb::enabled();
      return wantID ? profData()->allocTransID() : kInvalidTransID;
    }
    auto const pd = jit::profData();
    auto const fresh = pd->allocTransID();
    pd->addTransProfPrologue(fresh, funcBody, paramIndex);
    return fresh;
  }();

  TCA start = genFuncPrologue(transID, kind, func, argc, view, meta);

  auto loc = maker.markEnd();
  auto metaLock = lockMetadata();

  if (RuntimeOption::EvalEnableReusableTC) {
    // Snapshot the ranges before a possible relocation; they only feed the
    // trace output below.
    TCA UNUSED ms = loc.mainStart();
    TCA UNUSED me = loc.mainEnd();
    TCA UNUSED cs = loc.coldStart();
    TCA UNUSED ce = loc.coldEnd();
    TCA UNUSED fs = loc.frozenStart();
    TCA UNUSED fe = loc.frozenEnd();
    TCA UNUSED oldStart = start;

    auto const relocated = relocateNewTranslation(loc, view, meta, &start);

    if (!relocated) {
      FTRACE_MOD(Trace::reusetc, 1,
                 "Created prologue for func {} (id = {}) at "
                 "M[{}, {}], C[{}, {}], F[{}, {}] start @ {}\n",
                 func->fullName()->data(), func->getFuncId(),
                 ms, me, cs, ce, fs, fe, oldStart);
    } else {
      FTRACE_MOD(Trace::reusetc, 1,
                 "Relocated prologue for func {} (id = {}) "
                 "from M[{}, {}], C[{}, {}], F[{}, {}] to M[{}, {}] "
                 "C[{}, {}] F[{}, {}] orig start @ {} new start @ {}\n",
                 func->fullName()->data(), func->getFuncId(),
                 ms, me, cs, ce, fs, fe,
                 loc.mainStart(), loc.mainEnd(),
                 loc.coldStart(), loc.coldEnd(),
                 loc.frozenStart(), loc.frozenEnd(),
                 oldStart, start);
    }

    recordFuncPrologue(func, loc);

    // If the final location is not where we started emitting, give back the
    // space that alignment may have consumed.
    if (loc.mainStart() != alignedStart) {
      view.main().setFrontier(frontierBeforeAlign);
    }
  }

  if (RuntimeOption::EvalPerfRelocate) {
    GrowableVector<IncomingBranch> noBranches;
    recordPerfRelocMap(loc.mainStart(), loc.mainEnd(),
                       loc.coldCodeStart(), loc.coldEnd(),
                       funcBody, paramIndex, noBranches, meta);
  }
  meta.process(nullptr);

  assertx(funcGuardMatches(funcGuardFromPrologue(start, func), func));
  assertx(code().isValidCodeAddress(start));

  TRACE(2, "funcPrologue %s(%d) setting prologue %p\n",
        func->fullName()->data(), argc, start);
  func->setPrologue(paramIndex, start);

  assertx(kind == TransKind::LivePrologue ||
          kind == TransKind::ProfPrologue ||
          kind == TransKind::OptPrologue);

  auto rec = maker.rec(funcBody, transID, kind);
  transdb::addTranslation(rec);
  if (RuntimeOption::EvalJitUseVtuneAPI) {
    reportTraceletToVtune(func->unit(), func, rec);
  }

  recordGdbTranslation(funcBody, func, view.main(), loc.mainStart(),
                       false, true);
  recordBCInstr(OpFuncPrologue, loc.mainStart(), loc.mainEnd(), false);

  return start;
}
/*
 * Emit the free-locals stubs into `cb' and publish their entry points in
 * `us' (freeManyLocalsHelper and freeLocalsHelpers[]).
 *
 * Layout, in emission order: the decref helper, the looping "many locals"
 * entry, the unrolled per-local entries from the highest index down to 0, and
 * one `ret' shared by all of them.  Returns the address of the decref helper.
 */
TCA emitFreeLocalsHelpers(CodeBlock& cb, UniqueStubs& us) {
  // rarg(1) carries the address of the first local on entry; rarg(2) and
  // rarg(3) serve as scratch registers.
  auto const localReg = rarg(1);
  auto const lastReg = rarg(2);
  auto const typeReg = rarg(3);

  CGMeta meta;

  // Very hot stub: start it on a cache line boundary.
  align(cb, &meta, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, meta, localReg, typeReg, localReg | lastReg);

  // Load the current local's type and call `release' if it compares above
  // KindOfRefCountThreshold.
  auto const decRefCurrent = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // A plain byte load is not enough: `typeReg' is used as a 32-bit index
    // into the destructor table, so the byte is widened while loading.
    v << loadzbl{localReg[TVOFF(m_type)], typeReg};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, typeReg);

    ifThen(v, CC_G, sf, [&] (Vout& vc) {
      vc << call{release, arg_regs(3)};
    });
  };

  // Step `localReg' forward by one TypedValue.
  auto const advance = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               localReg, localReg, v.makeReg()};
  };

  alignJmpTarget(cb);

  us.freeManyLocalsHelper = vwrap(cb, meta, [&] (Vout& v) {
    // The last `kNumFreeLocalsHelpers' decrefs are always handled by the
    // unrolled entries below, so the loop stops once it reaches them.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], lastReg};

    doWhile(v, CC_NZ, {},
      [&] (const VregList&, const VregList&) {
        auto const sf = v.makeReg();

        decRefCurrent(v);
        advance(v);
        v << cmpq{localReg, lastReg, sf};
        return sf;
      }
    );
  });

  // Emit from the highest index down so each entry runs into the next one;
  // entry 0 does not advance because nothing follows it but the ret.
  auto idx = kNumFreeLocalsHelpers - 1;
  while (idx >= 0) {
    us.freeLocalsHelpers[idx] = vwrap(cb, [&] (Vout& v) {
      decRefCurrent(v);
      if (idx == 0) return;
      advance(v);
    });
    --idx;
  }

  // One ret serves every entry point above.
  vwrap(cb, meta, [] (Vout& v) { v << ret{}; });

  // Keep this hot stub small.  Windows needs more work here, so the size
  // bound is not enforced there.
#ifndef _WIN32
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * x64::cache_line_size()));
#endif

  meta.process(nullptr);
  return release;
}
/*
 * Emit the free-locals stubs into `cb' (with `data' as the accompanying data
 * block) and publish their entry points in `us' (freeManyLocalsHelper and
 * freeLocalsHelpers[]).
 *
 * Layout, in emission order: the decref helper, the looping "many locals"
 * entry, the unrolled per-local entries from the highest index down to 0, and
 * one `ret' shared by all of them.  Returns the address of the decref helper.
 */
TCA emitFreeLocalsHelpers(CodeBlock& cb, DataBlock& data, UniqueStubs& us) {
  // The address of the first local is passed in the second argument register.
  // We use the third and fourth as scratch registers.
  auto const local = rarg(1);
  auto const last = rarg(2);
  auto const type = rarg(3);
  CGMeta fixups;

  // This stub is very hot; keep it cache-aligned.
  align(cb, &fixups, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, data, fixups, local, type, local | last);

  // Load the current local's type and call `release' if it compares above
  // KindOfRefCountThreshold.
  auto const decref_local = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // A plain byte load is not enough: `type' is used as a 32-bit index into
    // the destructor table, so the byte is widened while loading.
    v << loadzbl{local[TVOFF(m_type)], type};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, type);

    ifThen(v, CC_G, sf, [&] (Vout& v) {
      // Keep the slot size signed: with an unsigned size_t here,
      // `-2 * dword_size' would be evaluated in unsigned arithmetic and only
      // turn into a negative displacement through implicit narrowing.
      auto const dword_size = static_cast<int>(sizeof(int64_t));

      // Save the return address (read via mflr) on the stack across the
      // call, reserving two slots so the stack stays 16-byte aligned.
      v << mflr{rfuncln()};
      v << lea {rsp()[-2 * dword_size], rsp()};
      v << store{rfuncln(), rsp()[0]};

      v << call{release, arg_regs(3)};

      // Restore the return address from the stack and release the slots.
      v << load{rsp()[0], rfuncln()};
      v << lea {rsp()[2 * dword_size], rsp()};
      v << mtlr{rfuncln()};
    });
  };

  // Step `local' forward by one TypedValue.
  auto const next_local = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               local, local, v.makeReg()};
  };

  alignJmpTarget(cb);

  us.freeManyLocalsHelper = vwrap(cb, data, fixups, [&] (Vout& v) {
    // We always unroll the final `kNumFreeLocalsHelpers' decrefs, so only loop
    // until we hit that point.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], last};

    doWhile(v, CC_NZ, {},
      [&] (const VregList& in, const VregList& out) {
        auto const sf = v.makeReg();

        decref_local(v);
        next_local(v);
        v << cmpq{local, last, sf};
        return sf;
      }
    );
  });

  // Emitted from the highest index down so each entry runs into the next;
  // entry 0 does not advance because only the shared ret follows it.
  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    us.freeLocalsHelpers[i] = vwrap(cb, data, [&] (Vout& v) {
      decref_local(v);
      if (i != 0) next_local(v);
    });
  }

  // All the stub entrypoints share the same ret.
  vwrap(cb, data, fixups, [] (Vout& v) { v << ret{}; });

  // This stub is hot, so make sure to keep it small.
#if 0
  // TODO(gut): Currently this assert fails.
  // Take a closer look when looking at performance
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * cache_line_size()));
#endif

  fixups.process(nullptr);
  return release;
}
/*
 * Emit the free-locals stubs into `cb' (with `data' as the accompanying data
 * block) and publish their entry points in `us'.
 *
 * The stub bodies (decref helper, looping body, unrolled bodies, shared
 * epilogue) are emitted first.  The entry points stored in `us' are small
 * trampolines emitted afterwards: each pushes rlr()/rfp() and jumps to the
 * matching body.  Returns the address of the decref helper.
 */
TCA emitFreeLocalsHelpers(CodeBlock& cb, DataBlock& data, UniqueStubs& us) {
  // rarg(1) carries the address of the first local on entry; rarg(2) and
  // rarg(3) serve as scratch registers.
  auto const localReg = rarg(1);
  auto const lastReg = rarg(2);
  auto const typeReg = rarg(3);

  CGMeta meta;

  // Addresses of the unrolled bodies the trampolines jump to.
  TCA unrolledBodies[kNumFreeLocalsHelpers];

  // Very hot stub: start it on a cache line boundary.
  align(cb, &meta, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, data, meta, localReg, typeReg, localReg | lastReg);

  // Load the current local's type and call `release' if it compares above
  // KindOfRefCountThreshold.
  auto const decRefCurrent = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // emitLoadTVType() is not usable here since it does a byte load;
    // `typeReg' is used as a 32-bit index into the destructor table, so the
    // byte is widened while loading.
    v << loadzbl{localReg[TVOFF(m_type)], typeReg};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, typeReg);

    ifThen(v, CC_G, sf, [&] (Vout& vc) {
      vc << call{release, arg_regs(3)};
    });
  };

  // Step `localReg' forward by one TypedValue.
  auto const advance = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               localReg, localReg, v.makeReg()};
  };

  alignJmpTarget(cb);

  TCA const loopBody = vwrap(cb, data, [&] (Vout& v) {
    // The last `kNumFreeLocalsHelpers' decrefs are always handled by the
    // unrolled bodies below, so the loop stops once it reaches them.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], lastReg};

    // Establish frame linkage so no indirect fixup is needed.
    v << copy{rsp(), rfp()};

    doWhile(v, CC_NZ, {},
      [&] (const VregList&, const VregList&) {
        auto const sf = v.makeReg();

        decRefCurrent(v);
        advance(v);
        v << cmpq{localReg, lastReg, sf};
        return sf;
      }
    );
  });

  // Emit from the highest index down so each body runs into the next one;
  // body 0 does not advance because only the shared epilogue follows it.
  {
    auto idx = kNumFreeLocalsHelpers - 1;
    while (idx >= 0) {
      unrolledBodies[idx] = vwrap(cb, data, [&] (Vout& v) {
        decRefCurrent(v);
        if (idx == 0) return;
        advance(v);
      });
      --idx;
    }
  }

  // One epilogue serves every body above.
  vwrap(cb, data, meta, [] (Vout& v) {
    v << popp{rfp(), rlr()};
    v << ret{};
  });

  // Table of branches: the published entry points.
  us.freeManyLocalsHelper = vwrap(cb, data, [&] (Vout& v) {
    v << pushp{rlr(), rfp()};

    // The looping body still reads rvmfp(), so setting up frame linkage is
    // left to that body, after its use of rvmfp().
    v << jmpi{loopBody};
  });
  {
    auto idx = kNumFreeLocalsHelpers - 1;
    while (idx >= 0) {
      us.freeLocalsHelpers[idx] = vwrap(cb, data, [&] (Vout& v) {
        // Establish frame linkage so no indirect fixup is needed.
        v << pushp{rlr(), rfp()};
        v << copy{rsp(), rfp()};
        v << jmpi{unrolledBodies[idx]};
      });
      --idx;
    }
  }

  // FIXME: This stub is hot, so make sure to keep it small.
#if 0
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * x64::cache_line_size()));
#endif

  meta.process(nullptr);
  return release;
}
void relocate(std::vector<TransRelocInfo>& relocs, CodeBlock& dest, CGMeta& fixups) { assertOwnsCodeLock(); assert(!Func::s_treadmill); auto newRelocMapName = Debug::DebugInfo::Get()->getRelocMapName() + ".tmp"; auto newRelocMap = fopen(newRelocMapName.c_str(), "w+"); if (!newRelocMap) return; SCOPE_EXIT { if (newRelocMap) fclose(newRelocMap); }; Func::s_treadmill = true; SCOPE_EXIT { Func::s_treadmill = false; }; auto ignoreEntry = [](const SrcKey& sk) { // We can have entries such as UniqueStubs with no SrcKey // These are ok to process. if (!sk.valid()) return false; // But if the func has been removed from the AtomicHashMap, // we don't want to process it. return !Func::isFuncIdValid(sk.funcID()); }; RelocationInfo rel; size_t num = 0; assert(fixups.alignments.empty()); for (size_t sz = relocs.size(); num < sz; num++) { auto& reloc = relocs[num]; if (ignoreEntry(reloc.sk)) continue; auto start DEBUG_ONLY = dest.frontier(); try { x64::relocate(rel, dest, reloc.start, reloc.end, reloc.fixups, nullptr); } catch (const DataBlockFull& dbf) { break; } if (Trace::moduleEnabledRelease(Trace::mcg, 1)) { Trace::traceRelease( folly::sformat("Relocate: 0x{:08x}+0x{:04x} => 0x{:08x}+0x{:04x}\n", (uintptr_t)reloc.start, reloc.end - reloc.start, (uintptr_t)start, dest.frontier() - start)); } } swap_trick(fixups.alignments); assert(fixups.empty()); x64::adjustForRelocation(rel); for (size_t i = 0; i < num; i++) { if (!ignoreEntry(relocs[i].sk)) { x64::adjustMetaDataForRelocation(rel, nullptr, relocs[i].fixups); } } for (size_t i = 0; i < num; i++) { if (!ignoreEntry(relocs[i].sk)) { relocs[i].fixups.process_only(nullptr); } } // At this point, all the relocated code should be correct, and runable. // But eg if it has unlikely paths into cold code that has not been relocated, // then the cold code will still point back to the original, not the relocated // versions. Similarly reusable stubs will still point to the old code. 
// Since we can now execute the relocated code, its ok to start fixing these // things now. for (auto& it : srcDB()) { it.second->relocate(rel); } std::unordered_set<Func*> visitedFuncs; CodeSmasher s; for (size_t i = 0; i < num; i++) { auto& reloc = relocs[i]; if (ignoreEntry(reloc.sk)) continue; for (auto& ib : reloc.incomingBranches) { ib.relocate(rel); } if (!reloc.sk.valid()) continue; auto f = const_cast<Func*>(reloc.sk.func()); x64::adjustCodeForRelocation(rel, reloc.fixups); reloc.fixups.clear(); // fixup code references in the corresponding cold block to point // to the new code x64::adjustForRelocation(rel, reloc.coldStart, reloc.coldEnd); if (visitedFuncs.insert(f).second) { if (auto adjusted = rel.adjustedAddressAfter(f->getFuncBody())) { f->setFuncBody(adjusted); } int num = Func::getMaxNumPrologues(f->numParams()); if (num < kNumFixedPrologues) num = kNumFixedPrologues; while (num--) { auto addr = f->getPrologue(num); if (auto adjusted = rel.adjustedAddressAfter(addr)) { f->setPrologue(num, adjusted); } } } if (reloc.end != reloc.start) { s.entries.emplace_back(reloc.start, reloc.end); } } auto relocMap = Debug::DebugInfo::Get()->getRelocMap(); always_assert(relocMap); fseek(relocMap, 0, SEEK_SET); auto deadStubs = getFreeTCStubs(); auto param = PostProcessParam { rel, deadStubs, newRelocMap }; std::set<TCA> liveStubs; readRelocations(relocMap, &liveStubs, postProcess, ¶m); // ensure that any reusable stubs are updated for the relocated code for (auto stub : liveStubs) { FTRACE(1, "Stub: 0x{:08x}\n", (uintptr_t)stub); fixups.reusedStubs.emplace_back(stub); always_assert(!rel.adjustedAddressAfter(stub)); fprintf(newRelocMap, "%" PRIxPTR " 0 %s\n", uintptr_t(stub), "NewStub"); } x64::adjustCodeForRelocation(rel, fixups); unlink(Debug::DebugInfo::Get()->getRelocMapName().c_str()); rename(newRelocMapName.c_str(), Debug::DebugInfo::Get()->getRelocMapName().c_str()); fclose(newRelocMap); newRelocMap = nullptr; 
freopen(Debug::DebugInfo::Get()->getRelocMapName().c_str(), "r+", relocMap); fseek(relocMap, 0, SEEK_END); okToRelocate = false; Treadmill::enqueue(std::move(s)); }