std::pair< std::vector<std::unique_ptr<UnitEmitter>>, std::unique_ptr<ArrayTypeTable::Builder> > whole_program(std::vector<std::unique_ptr<UnitEmitter>> ues, int num_threads) { trace_time tracer("whole program"); RuntimeOption::EvalLowStaticArrays = false; if (num_threads > 0) { parallel::num_threads = num_threads; } LitstrTable::get().setReading(); auto program = parse_program(std::move(ues)); state_after("parse", *program); Index index{borrow(program)}; if (!options.NoOptimizations) { assert(check(*program)); constant_pass(index, *program); analyze_iteratively(index, *program, AnalyzeMode::NormalPass); if (options.AnalyzePublicStatics) { analyze_public_statics(index, *program); analyze_iteratively(index, *program, AnalyzeMode::NormalPass); } final_pass(index, *program); state_after("optimize", *program); } if (options.AnalyzePublicStatics) { mark_persistent_static_properties(index, *program); } debug_dump_program(index, *program); print_stats(index, *program); LitstrTable::get().setWriting(); ues = make_unit_emitters(index, *program); return { std::move(ues), std::move(index.array_table_builder()) }; }
/*
 * Iteratively analyze every function in the program to a fixed point,
 * then run a final optimization pass over the results.
 *
 * @param index    shared cross-function knowledge; refined as we learn
 *                 better return types, and queried by every analysis.
 * @param program  the program being optimized; bytecode is mutated only
 *                 during the final pass.
 */
void optimize(Index& index, php::Program& program) {
  assert(check(program));
  trace_time tracer("optimize");
  SCOPE_EXIT { state_after("optimize", program); };

  // Counters, just for debug printing.
  std::atomic<uint32_t> total_funcs{0};
  auto round = uint32_t{0};

  /*
   * Algorithm:
   *
   * Start by running an analyze pass on every function.  During
   * analysis, information about functions or classes will be
   * requested from the Index, which initially won't really know much,
   * but will record a dependency.  This part is done in parallel: no
   * passes are mutating anything, just reading from the Index.
   *
   * After a pass, we do a single-threaded "update" step to prepare
   * for the next pass: for each function that was analyzed, note the
   * facts we learned that may aid analyzing other functions in the
   * program, and register them in the index.  At this point, if any
   * of these facts are more useful than they used to be, add all the
   * Contexts that had a dependency on the new information to the work
   * list again, in case they can do better based on the new fact.
   *
   * Repeat until the work list is empty.
   */
  auto work = initial_work(program);
  while (!work.empty()) {
    auto const results = [&] {
      trace_time trace(
        "analyzing",
        folly::format("round {} -- {} work items", round, work.size()).str()
      );
      return parallel_map(
        work,
        [&] (const Context& ctx) -> folly::Optional<FuncAnalysis> {
          total_funcs.fetch_add(1, std::memory_order_relaxed);
          return analyze_func(index, ctx);
        }
      );
    }();
    ++round;

    trace_time update_time("updating");

    std::set<Context> revisit;
    for (auto i = size_t{0}; i < results.size(); ++i) {
      auto& result = *results[i];
      // NOTE: `work` must still hold the contexts that produced
      // `results` for these sanity checks; it is only cleared after
      // this loop.  (Clearing it earlier made work[i] an
      // out-of-bounds read.)
      assert(result.ctx.func == work[i].func);
      assert(result.ctx.cls == work[i].cls);
      assert(result.ctx.unit == work[i].unit);

      auto deps = index.refine_return_type(
        result.ctx.func, result.inferredReturn
      );
      for (auto& d : deps) revisit.insert(d);
    }

    // Rebuild the work list from the contexts whose dependencies were
    // refined this round.
    work.clear();
    std::copy(begin(revisit), end(revisit), std::back_inserter(work));
  }

  if (Trace::moduleEnabledRelease(Trace::hhbbc_time, 1)) {
    Trace::traceRelease("total function visits %u\n", total_funcs.load());
  }

  /*
   * Finally, use the results of all these iterations to perform
   * optimization.  This reanalyzes every function using our
   * now-very-updated Index, and then runs optimize_func with the
   * results.
   *
   * We do this in parallel: all the shared information is queried out
   * of the index, and each thread is allowed to modify the bytecode
   * for the function it is looking at.
   *
   * NOTE: currently they can't modify anything other than the
   * bytecode/Blocks, because other threads may be doing unlocked
   * queries to php::Func and php::Class structures.
   */
  trace_time final_pass("final pass");
  // (The previous version also assigned initial_work(program) to
  // `work` here; that assignment was dead and computed the work list
  // twice.)
  parallel_for_each(
    initial_work(program),
    [&] (Context ctx) {
      optimize_func(index, analyze_func(index, ctx));
    }
  );
}