Example #1
TCA genFuncBodyDispatch(Func* func, const DVFuncletsVec& dvs,
                        CodeCache::View code) {
  auto context = prologue_context(kInvalidTransID, TransKind::Live,
                                  func, func->base());
  IRUnit unit{context};
  irgen::IRGS env{unit};

  auto& main = code.main();
  auto& frozen = code.frozen();

  auto const start = main.frontier();

  irgen::emitFuncBodyDispatch(env, dvs);
  irgen::sealUnit(env);

  CGMeta fixups;
  irlower::genCode(env.unit, code, fixups, CodeKind::CrossTrace);

  if (RuntimeOption::EvalPerfRelocate) {
    GrowableVector<IncomingBranch> ibs;
    recordPerfRelocMap(start, main.frontier(),
                       frozen.frontier(), frozen.frontier(),
                       SrcKey { func, dvs[0].second, false },
                       0, ibs, fixups);
  }
  fixups.process(nullptr);

  return start;
}
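This helper follows the JIT's usual emit pipeline: build an IRUnit, lower it with irlower::genCode, then apply the collected CGMeta fixups. A minimal sketch of a call site, assuming Func::getDVFunclets and Func::setFuncBody as the surrounding API (an assumption; the real install point is not shown here):

// Hypothetical call site (assumption): install the dispatch stub on the Func.
auto const dvs = func->getDVFunclets();
if (!dvs.empty()) {
  auto const addr = genFuncBodyDispatch(func, dvs, code().view(TransKind::Live));
  func->setFuncBody(addr);
}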
Example #2
template<class T> uint64_t test_const(T val) {
  using testfunc = double (*)();
  static const Abi test_abi = {
    .gpUnreserved = RegSet{},
    .gpReserved = x64::abi().gp(),
    .simdUnreserved = RegSet{xmm0},
    .simdReserved = x64::abi().simd() - RegSet{xmm0},
    .calleeSaved = x64::abi().calleeSaved,
    .sf = x64::abi().sf
  };
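  // With this ABI, xmm0 is the only allocatable register, so the register
  // allocator must materialize the constant directly into the register the
  // test returns in.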
  static uint8_t code[1000];
  // None of these tests should use any data, so the buffer is zero-length
  // (a GNU extension; standard C++ forbids zero-sized arrays).
  static uint8_t data_buffer[0];

  CodeBlock main;
  main.init(code, sizeof(code), "test");
  DataBlock data;
  data.init(data_buffer, sizeof(data_buffer), "data");

  Vunit unit;
  Vasm vasm{unit};
  Vtext text { main, data };

  auto& v = vasm.main();
  unit.entry = v;

  v << copy{v.cns(val), Vreg{xmm0}};
  v << ret{RegSet{xmm0}};

  optimizeX64(vasm.unit(), test_abi, true /* regalloc */);
  CGMeta fixups;
  LeaseHolder writer{Translator::WriteLease()};
  EXPECT_TRUE(writer.canWrite());
  emitX64(unit, text, fixups, nullptr);
  // The above code might use fixups.literals but shouldn't use anything else.
  fixups.literals.clear();
  EXPECT_TRUE(fixups.empty());

  union { double d; uint64_t c; } u;
  u.d = ((testfunc)code)();
  return u.c;
}
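A typical use in a test (the test name is hypothetical; the expected value is simply the IEEE-754 bit pattern of 42.0):

// Hypothetical usage: check the exact bits the emitted code returns.
TEST(Vasm, TestConst) {
  EXPECT_EQ(test_const(42.0), 0x4045000000000000ULL);  // bits of 42.0
}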
Example #3
template<class T> uint64_t test_const(T val) {
  using testfunc = double (*)();
  static const Abi test_abi = {
    .gpUnreserved = RegSet{},
    .gpReserved = x64::abi().gp(),
    .simdUnreserved = RegSet{xmm0},
    .simdReserved = x64::abi().simd() - RegSet{xmm0},
    .calleeSaved = x64::abi().calleeSaved,
    .sf = x64::abi().sf
  };
  static uint8_t code[1000];

  CodeBlock main;
  main.init(code, sizeof(code), "test");

  Vasm vasm;
  Vtext text { main };

  auto& unit = vasm.unit();
  auto& v = vasm.main();
  unit.entry = v;

  v << copy{v.cns(val), Vreg{xmm0}};
  v << ret{RegSet{xmm0}};

  optimizeX64(vasm.unit(), test_abi);
  CGMeta fixups;
  emitX64(unit, text, fixups, nullptr);
  // The above code might use fixups.literals but shouldn't use anything else.
  fixups.literals.clear();
  EXPECT_TRUE(fixups.empty());

  union { double d; uint64_t c; } u;
  u.d = ((testfunc)code)();
  return u.c;
}
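Both variants read the result back by type-punning through a union, the usual systems-code idiom but formally undefined behavior in standard C++. A strictly conforming alternative for the function tail (a sketch; requires <cstring>) is std::memcpy:

// Strictly conforming replacement for the union pun above.
double d = ((testfunc)code)();
uint64_t bits;
static_assert(sizeof(bits) == sizeof(d), "double/uint64_t size mismatch");
std::memcpy(&bits, &d, sizeof(bits));
return bits;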
Example #4
static TCA emitFuncPrologueImpl(Func* func, int argc, TransKind kind) {
  if (!newTranslation()) {
    return nullptr;
  }

  const int nparams = func->numNonVariadicParams();
  const int paramIndex = argc <= nparams ? argc : nparams + 1;
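  // Worked example (illustration, not in the original): with nparams = 2,
  // the prologue table has one slot per argc in [0, nparams] plus a single
  // overflow slot:
  //   argc = 0 -> paramIndex 0
  //   argc = 1 -> paramIndex 1
  //   argc = 2 -> paramIndex 2
  //   argc >= 3 -> paramIndex 3 (nparams + 1)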

  auto const funcBody = SrcKey{func, func->getEntryForNumArgs(argc), false};

  profileSetHotFuncAttr();
  auto codeLock = lockCode();
  auto codeView = code().view(kind);
  TCA mainOrig = codeView.main().frontier();
  CGMeta fixups;

  // If we're close to a cache line boundary, just burn some space to
  // try to keep the func and its body on fewer total lines.
  align(codeView.main(), &fixups, Alignment::CacheLineRoundUp,
        AlignContext::Dead);

  TransLocMaker maker(codeView);
  maker.markStart();

  // Careful: this isn't necessarily the real entry point. For funcIsMagic
  // prologues, this is just a possible prologue.
  TCA aStart = codeView.main().frontier();

  // Give the prologue a TransID if we have profiling data.
  auto const transID = [&]{
    if (kind == TransKind::ProfPrologue) {
      auto const profData = jit::profData();
      auto const id = profData->allocTransID();
      profData->addTransProfPrologue(id, funcBody, paramIndex);
      return id;
    }
    if (profData() && transdb::enabled()) {
      return profData()->allocTransID();
    }
    return kInvalidTransID;
  }();
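  // ProfPrologue translations always get a TransID so profiling counters can
  // be attached; other kinds only get one when profiling data exists and the
  // translation DB is enabled.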

  TCA start = genFuncPrologue(transID, kind, func, argc, codeView, fixups);

  auto loc = maker.markEnd();
  auto metaLock = lockMetadata();

  if (RuntimeOption::EvalEnableReusableTC) {
    TCA UNUSED ms = loc.mainStart(), me = loc.mainEnd(),
               cs = loc.coldStart(), ce = loc.coldEnd(),
               fs = loc.frozenStart(), fe = loc.frozenEnd(),
               oldStart = start;

    auto const did_relocate = relocateNewTranslation(loc, codeView, fixups,
                                                     &start);

    if (did_relocate) {
      FTRACE_MOD(Trace::reusetc, 1,
                 "Relocated prologue for func {} (id = {}) "
                 "from M[{}, {}], C[{}, {}], F[{}, {}] to M[{}, {}] "
                 "C[{}, {}] F[{}, {}] orig start @ {} new start @ {}\n",
                 func->fullName()->data(), func->getFuncId(),
                 ms, me, cs, ce, fs, fe, loc.mainStart(), loc.mainEnd(),
                 loc.coldStart(), loc.coldEnd(), loc.frozenStart(),
                 loc.frozenEnd(), oldStart, start);
    } else {
      FTRACE_MOD(Trace::reusetc, 1,
                 "Created prologue for func {} (id = {}) at "
                 "M[{}, {}], C[{}, {}], F[{}, {}] start @ {}\n",
                 func->fullName()->data(), func->getFuncId(),
                 ms, me, cs, ce, fs, fe, oldStart);
    }

    recordFuncPrologue(func, loc);
    if (loc.mainStart() != aStart) {
      codeView.main().setFrontier(mainOrig); // we may have shifted to align
    }
  }
  if (RuntimeOption::EvalPerfRelocate) {
    GrowableVector<IncomingBranch> incomingBranches;
    recordPerfRelocMap(loc.mainStart(), loc.mainEnd(),
                       loc.coldCodeStart(), loc.coldEnd(),
                       funcBody, paramIndex,
                       incomingBranches,
                       fixups);
  }
  fixups.process(nullptr);

  assertx(funcGuardMatches(funcGuardFromPrologue(start, func), func));
  assertx(code().isValidCodeAddress(start));

  TRACE(2, "funcPrologue %s(%d) setting prologue %p\n",
        func->fullName()->data(), argc, start);
  func->setPrologue(paramIndex, start);

  assertx(kind == TransKind::LivePrologue ||
          kind == TransKind::ProfPrologue ||
          kind == TransKind::OptPrologue);

  auto tr = maker.rec(funcBody, transID, kind);
  transdb::addTranslation(tr);
  if (RuntimeOption::EvalJitUseVtuneAPI) {
    reportTraceletToVtune(func->unit(), func, tr);
  }

  recordGdbTranslation(funcBody, func, codeView.main(), loc.mainStart(),
                       false, true);
  recordBCInstr(OpFuncPrologue, loc.mainStart(), loc.mainEnd(), false);

  return start;
}
Example #5
TCA emitFreeLocalsHelpers(CodeBlock& cb, UniqueStubs& us) {
  // The address of the first local is passed in the second argument register.
  // We use the third and fourth as scratch registers.
  auto const local = rarg(1);
  auto const last = rarg(2);
  auto const type = rarg(3);
  CGMeta fixups;

  // This stub is very hot; keep it cache-aligned.
  align(cb, &fixups, Alignment::CacheLine, AlignContext::Dead);
  auto const release = emitDecRefHelper(cb, fixups, local, type, local | last);

  auto const decref_local = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // We can't do a plain byte load here---`type' must be widened to 32 bits
    // (loadzbl zero-extends it) for use as an array index into the
    // destructor table.
    v << loadzbl{local[TVOFF(m_type)], type};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, type);

    ifThen(v, CC_G, sf, [&] (Vout& v) {
      v << call{release, arg_regs(3)};
    });
  };
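  // Scalar model of the fast path above (illustration only): the release stub
  // is invoked just for refcounted values, i.e. when the type byte compares
  // greater than KindOfRefCountThreshold:
  //
  //   if (local->m_type > KindOfRefCountThreshold) release(local);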

  auto const next_local = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               local, local, v.makeReg()};
  };

  alignJmpTarget(cb);

  us.freeManyLocalsHelper = vwrap(cb, fixups, [&] (Vout& v) {
    // We always unroll the final `kNumFreeLocalsHelpers' decrefs, so only loop
    // until we hit that point.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], last};

    doWhile(v, CC_NZ, {},
      [&] (const VregList& in, const VregList& out) {
        auto const sf = v.makeReg();

        decref_local(v);
        next_local(v);
        v << cmpq{local, last, sf};
        return sf;
      }
    );
  });

  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    us.freeLocalsHelpers[i] = vwrap(cb, [&] (Vout& v) {
      decref_local(v);
      if (i != 0) next_local(v);
    });
  }

  // All the stub entrypoints share the same ret.
  vwrap(cb, fixups, [] (Vout& v) { v << ret{}; });

  // This stub is hot, so make sure to keep it small.
  // Alas, we have more work to do here under Windows, so we can't be
  // quite as small :(
#ifndef _WIN32
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * x64::cache_line_size()));
#endif

  fixups.process(nullptr);
  return release;
}
Example #6
TCA emitFreeLocalsHelpers(CodeBlock& cb, DataBlock& data, UniqueStubs& us) {
  // The address of the first local is passed in the second argument register.
  // We use the third and fourth as scratch registers.
  auto const local = rarg(1);
  auto const last = rarg(2);
  auto const type = rarg(3);
  CGMeta fixups;

  // This stub is very hot; keep it cache-aligned.
  align(cb, &fixups, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, data, fixups, local, type, local | last);

  auto const decref_local = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // We can't do a plain byte load here---`type' must be widened to 32 bits
    // (loadzbl zero-extends it) for use as an array index into the
    // destructor table.
    v << loadzbl{local[TVOFF(m_type)], type};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, type);

    ifThen(v, CC_G, sf, [&] (Vout& v) {
      auto const dword_size = sizeof(int64_t);

      // Save the return address (held in the link register) on the stack,
      // keeping the stack 16-byte aligned.
      v << mflr{rfuncln()};
      v << lea {rsp()[-2 * dword_size], rsp()};
      v << store{rfuncln(), rsp()[0]};

      v << call{release, arg_regs(3)};

      // Restore the return address from the stack.
      v << load{rsp()[0], rfuncln()};
      v << lea {rsp()[2 * dword_size], rsp()};
      v << mtlr{rfuncln()};
    });
  };

  auto const next_local = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               local, local, v.makeReg()};
  };

  alignJmpTarget(cb);

  us.freeManyLocalsHelper = vwrap(cb, data, fixups, [&] (Vout& v) {
    // We always unroll the final `kNumFreeLocalsHelpers' decrefs, so only loop
    // until we hit that point.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], last};

    doWhile(v, CC_NZ, {},
      [&] (const VregList& in, const VregList& out) {
        auto const sf = v.makeReg();

        decref_local(v);
        next_local(v);
        v << cmpq{local, last, sf};
        return sf;
      }
    );
  });

  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    us.freeLocalsHelpers[i] = vwrap(cb, data, [&] (Vout& v) {
      decref_local(v);
      if (i != 0) next_local(v);
    });
  }

  // All the stub entrypoints share the same ret.
  vwrap(cb, data, fixups, [] (Vout& v) { v << ret{}; });

  // This stub is hot, so make sure to keep it small.
#if 0
  // TODO(gut): This assert currently fails; take a closer look when
  // investigating performance.
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * cache_line_size()));
#endif

  fixups.process(nullptr);
  return release;
}
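Unlike x64, where a call pushes the return address on the stack, PPC64 leaves it in the link register, so a non-leaf sequence must spill it around the call; that is what the mflr/store/load/mtlr bracket above does. Its stack discipline, restated as pseudocode (illustration only):

// mflr rfuncln            -- return address out of the link register
// rsp -= 2 * dword_size   -- one 8-byte slot, padded for 16-byte alignment
// [rsp] = rfuncln         -- spill
// call release
// rfuncln = [rsp]         -- reload
// rsp += 2 * dword_size
// mtlr rfuncln            -- back into the link register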
Example #7
TCA emitFreeLocalsHelpers(CodeBlock& cb, DataBlock& data, UniqueStubs& us) {
  // The address of the first local is passed in the second argument register.
  // We use the third and fourth as scratch registers.
  auto const local = rarg(1);
  auto const last = rarg(2);
  auto const type = rarg(3);
  CGMeta fixups;
  TCA freeLocalsHelpers[kNumFreeLocalsHelpers];
  TCA freeManyLocalsHelper;

  // This stub is very hot; keep it cache-aligned.
  align(cb, &fixups, Alignment::CacheLine, AlignContext::Dead);
  auto const release =
    emitDecRefHelper(cb, data, fixups, local, type, local | last);

  auto const decref_local = [&] (Vout& v) {
    auto const sf = v.makeReg();

    // We can't use emitLoadTVType() here because it does a plain byte load;
    // `type' must be widened (loadzbl zero-extends it) for use as a 32-bit
    // array index into the destructor table.
    v << loadzbl{local[TVOFF(m_type)], type};
    emitCmpTVType(v, sf, KindOfRefCountThreshold, type);

    ifThen(v, CC_G, sf, [&] (Vout& v) {
      v << call{release, arg_regs(3)};
    });
  };

  auto const next_local = [&] (Vout& v) {
    v << addqi{static_cast<int>(sizeof(TypedValue)),
               local, local, v.makeReg()};
  };

  alignJmpTarget(cb);

  freeManyLocalsHelper = vwrap(cb, data, [&] (Vout& v) {
    // We always unroll the final `kNumFreeLocalsHelpers' decrefs, so only loop
    // until we hit that point.
    v << lea{rvmfp()[localOffset(kNumFreeLocalsHelpers - 1)], last};

    // Set up frame linkage to avoid an indirect fixup.
    v << copy{rsp(), rfp()};

    doWhile(v, CC_NZ, {},
      [&] (const VregList& in, const VregList& out) {
        auto const sf = v.makeReg();

        decref_local(v);
        next_local(v);
        v << cmpq{local, last, sf};
        return sf;
      }
    );
  });

  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    freeLocalsHelpers[i] = vwrap(cb, data, [&] (Vout& v) {
      decref_local(v);
      if (i != 0) next_local(v);
    });
  }

  // All the stub entrypoints share the same ret.
  vwrap(cb, data, fixups, [] (Vout& v) {
    v << popp{rfp(), rlr()};
    v << ret{};
  });

  // Create the table of entry trampolines that branch to the bodies above.
  us.freeManyLocalsHelper = vwrap(cb, data, [&] (Vout& v) {
    v << pushp{rlr(), rfp()};

    // rvmfp() is needed by the freeManyLocalsHelper stub above, so frame
    // linkage setup is deferred until after its use in freeManyLocalsHelper.
    v << jmpi{freeManyLocalsHelper};
  });
  for (auto i = kNumFreeLocalsHelpers - 1; i >= 0; --i) {
    us.freeLocalsHelpers[i] = vwrap(cb, data, [&] (Vout& v) {
      // We set up frame linkage to avoid an indirect fixup.
      v << pushp{rlr(), rfp()};
      v << copy{rsp(), rfp()};
      v << jmpi{freeLocalsHelpers[i]};
    });
  }

  // FIXME: This stub is hot, so make sure to keep it small.
#if 0
  always_assert(Stats::enabled() ||
                (cb.frontier() - release <= 4 * x64::cache_line_size()));
#endif

  fixups.process(nullptr);
  return release;
}
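A sketch of how a caller might select among these entry points (hypothetical call-site logic, not part of the stub itself): a translation freeing n locals jumps to the matching unrolled helper, falling back to the loop for larger counts.

// Hypothetical selection logic (assumption about the call sites).
auto const target = n <= kNumFreeLocalsHelpers
  ? us.freeLocalsHelpers[n - 1]
  : us.freeManyLocalsHelper;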
Example #8
void relocate(std::vector<TransRelocInfo>& relocs, CodeBlock& dest,
              CGMeta& fixups) {
  assertOwnsCodeLock();
  assert(!Func::s_treadmill);

  auto newRelocMapName = Debug::DebugInfo::Get()->getRelocMapName() + ".tmp";
  auto newRelocMap = fopen(newRelocMapName.c_str(), "w+");
  if (!newRelocMap) return;

  SCOPE_EXIT {
    if (newRelocMap) fclose(newRelocMap);
  };

  Func::s_treadmill = true;
  SCOPE_EXIT {
    Func::s_treadmill = false;
  };

  auto ignoreEntry = [](const SrcKey& sk) {
    // We can have entries, such as UniqueStubs, with no SrcKey; these are
    // fine to process.
    if (!sk.valid()) return false;
    // But if the func has been removed from the AtomicHashMap, we don't
    // want to process it.
    return !Func::isFuncIdValid(sk.funcID());
  };

  RelocationInfo rel;
  size_t num = 0;
  assert(fixups.alignments.empty());
  for (size_t sz = relocs.size(); num < sz; num++) {
    auto& reloc = relocs[num];
    if (ignoreEntry(reloc.sk)) continue;
    auto start DEBUG_ONLY = dest.frontier();
    try {
      x64::relocate(rel, dest,
                    reloc.start, reloc.end, reloc.fixups, nullptr);
    } catch (const DataBlockFull&) {
      break;
    }
    if (Trace::moduleEnabledRelease(Trace::mcg, 1)) {
      Trace::traceRelease(
        folly::sformat("Relocate: 0x{:08x}+0x{:04x} => 0x{:08x}+0x{:04x}\n",
                       (uintptr_t)reloc.start, reloc.end - reloc.start,
                       (uintptr_t)start, dest.frontier() - start));
    }
  }
  swap_trick(fixups.alignments);
  assert(fixups.empty());

  x64::adjustForRelocation(rel);

  for (size_t i = 0; i < num; i++) {
    if (!ignoreEntry(relocs[i].sk)) {
      x64::adjustMetaDataForRelocation(rel, nullptr, relocs[i].fixups);
    }
  }

  for (size_t i = 0; i < num; i++) {
    if (!ignoreEntry(relocs[i].sk)) {
      relocs[i].fixups.process_only(nullptr);
    }
  }

  // At this point, all the relocated code should be correct and runnable.
  // But if, e.g., it has unlikely paths into cold code that has not been
  // relocated, that cold code will still point back to the original, not
  // the relocated, versions. Similarly, reusable stubs will still point to
  // the old code. Since we can now execute the relocated code, it's OK to
  // start fixing these things up now.

  for (auto& it : srcDB()) {
    it.second->relocate(rel);
  }

  std::unordered_set<Func*> visitedFuncs;
  CodeSmasher s;
  for (size_t i = 0; i < num; i++) {
    auto& reloc = relocs[i];
    if (ignoreEntry(reloc.sk)) continue;
    for (auto& ib : reloc.incomingBranches) {
      ib.relocate(rel);
    }
    if (!reloc.sk.valid()) continue;
    auto f = const_cast<Func*>(reloc.sk.func());

    x64::adjustCodeForRelocation(rel, reloc.fixups);
    reloc.fixups.clear();

    // Fix up code references in the corresponding cold block to point to
    // the new code.
    x64::adjustForRelocation(rel, reloc.coldStart, reloc.coldEnd);

    if (visitedFuncs.insert(f).second) {
      if (auto adjusted = rel.adjustedAddressAfter(f->getFuncBody())) {
        f->setFuncBody(adjusted);
      }
      // Renamed from `num' to avoid shadowing the outer loop counter.
      int numPrologues = Func::getMaxNumPrologues(f->numParams());
      if (numPrologues < kNumFixedPrologues) numPrologues = kNumFixedPrologues;
      while (numPrologues--) {
        auto addr = f->getPrologue(numPrologues);
        if (auto adjusted = rel.adjustedAddressAfter(addr)) {
          f->setPrologue(numPrologues, adjusted);
        }
      }
    }
    if (reloc.end != reloc.start) {
      s.entries.emplace_back(reloc.start, reloc.end);
    }
  }

  auto relocMap = Debug::DebugInfo::Get()->getRelocMap();
  always_assert(relocMap);
  fseek(relocMap, 0, SEEK_SET);

  auto deadStubs = getFreeTCStubs();
  auto param = PostProcessParam { rel, deadStubs, newRelocMap };
  std::set<TCA> liveStubs;
  readRelocations(relocMap, &liveStubs, postProcess, &param);

  // ensure that any reusable stubs are updated for the relocated code
  for (auto stub : liveStubs) {
    FTRACE(1, "Stub: 0x{:08x}\n", (uintptr_t)stub);
    fixups.reusedStubs.emplace_back(stub);
    always_assert(!rel.adjustedAddressAfter(stub));
    fprintf(newRelocMap, "%" PRIxPTR " 0 %s\n", uintptr_t(stub), "NewStub");
  }
  x64::adjustCodeForRelocation(rel, fixups);

  unlink(Debug::DebugInfo::Get()->getRelocMapName().c_str());
  rename(newRelocMapName.c_str(),
         Debug::DebugInfo::Get()->getRelocMapName().c_str());
  fclose(newRelocMap);
  newRelocMap = nullptr;
  freopen(Debug::DebugInfo::Get()->getRelocMapName().c_str(), "r+", relocMap);
  fseek(relocMap, 0, SEEK_END);

  okToRelocate = false;
  Treadmill::enqueue(std::move(s));
}
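The final enqueue uses the treadmill pattern: the code ranges recorded in CodeSmasher are only overwritten once every request in flight at enqueue time has drained, so no thread can still be executing them. A sketch of the shape such a functor takes (an assumption; CodeSmasher's real definition lives elsewhere):

// Sketch (assumption): the treadmill invokes this once no in-flight request
// can still be running the old code.
struct CodeSmasher {
  std::vector<std::pair<TCA, TCA>> entries;
  void operator()() {
    for (auto& e : entries) {
      // Overwrite the dead range [e.first, e.second), e.g. with trap
      // instructions, then allow relocation again.
    }
    okToRelocate = true;
  }
};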