TEST(SumNode, GenerateAndSumHaveEqualAmount2) { std::function<void(Context&)> start_func = [](Context& ctx) { // TODO(ms): Replace this with some test-specific rendered file auto input = ReadLines(ctx, "inputs/test1") .Map([](const std::string& line) { return std::stoi(line); }); auto ones = input.Map([](int in) { return in; }); auto add_function = [](int in1, int in2) { return in1 + in2; }; DIA<int> coll = ones.Collapse(); ASSERT_EQ(136, coll.Sum(add_function)); ASSERT_EQ(16u, coll.Size()); }; api::RunLocalTests(start_func); }
auto PageRank(const DIA<std::string, InStack>&in, api::Context & ctx, int iter) { DIA<Page_Link> input = in.Map( [](const std::string& input) { auto split = thrill::common::Split(input, " "); LOG0 << "input " << (std::stoi(split[0]) - 1) << " " << (std::stoi(split[1]) - 1); // set base of page_id to 0 return Page_Link((size_t)(std::stoi(split[0]) - 1), (size_t)(std::stoi(split[1]) - 1)); }).Cache(); // TODO(SL): when Cache() is removed, code doesn't compile, // auto cannot be used either // aggregate all outgoing links of a page in this format: // // URL OUTGOING // ([linked_url, linked_url, ...]) // ([linked_url, linked_url, ...]) // ([linked_url, linked_url, ...]) // ... // get number of nodes by finding max page_id // add 1 to max node_id to get number of nodes because of node_id 0 const auto number_nodes = input.Sum( [](const Page_Link& in1, const Page_Link& in2) { Node first = std::max(in1.first, in2.first); Node second = std::max(in1.second, in2.second); return std::make_pair(std::max(first, second), first); }).first + 1; LOG << "number_nodes " << number_nodes; // group outgoing links auto links = input.GroupByIndex<Outgoings>( [](Page_Link p) { return p.first; }, [](auto& r, Key) { std::vector<Node> all; while (r.HasNext()) { all.push_back(r.Next().second); } // std::string s = "{"; // for (auto e : all) { // s+= std::to_string(e) + ", "; // } // LOG << "links " << s << "}"; return all; }, number_nodes).Cache(); // initialize all ranks to 1.0 // // (url, rank) // (url, rank) // (url, rank) // ... // auto ranks = Generate(ctx, [](const size_t& index) { // return std::make_pair(index, 1.0); // }, number_nodes).Cache(); auto ranks = Generate(ctx, [](const size_t&) { return (Rank)1.0; }, number_nodes).Cache(); auto node_ids = Generate(ctx, [](const size_t& index) { return index + 1; }, number_nodes); // do iterations for (int i = 0; i < iter; ++i) { LOG << "iteration " << i; // for all outgoing link, get their rank contribution from all // links by doing: // // 1) group all outgoing links with rank of its parent page: (Zip) // // ([linked_url, linked_url, ...], rank_parent) // ([linked_url, linked_url, ...], rank_parent) // ([linked_url, linked_url, ...], rank_parent) // // 2) compute rank contribution for each linked_url: (FlatMap) // // (linked_url, rank / OUTGOING.size) // (linked_url, rank / OUTGOING.size) // (linked_url, rank / OUTGOING.size) // ... std::cout << links.Size() << std::endl; std::cout << ranks.Size() << std::endl; assert(links.Size() == ranks.Size()); // TODO(SL): when Zip/FlatMap chained, code doesn't compile, please check DIA<Outgoings_Rank> outs_rank = links.Zip(ranks, [](const Outgoings& l, const Rank r) { // std::string s = "{"; // for (auto e : l) { // s += std::to_string(e) + ", "; // } // s += "}"; // LOG << "contribs1 " << s << " " << r; return std::make_pair(l, r); }); auto contribs = outs_rank.FlatMap<Page_Rank>( [](const Outgoings_Rank& p, auto emit) { if (p.first.size() > 0) { Rank rank_contrib = p.second / p.first.size(); // assert (rank_contrib <= 1); for (auto e : p.first) { LOG << "contribs2 " << e << " " << rank_contrib; emit(std::make_pair(e, rank_contrib)); } } }); // reduce all rank contributions by adding all rank contributions // and compute the new rank with 0.15 * 0.85 * sum_rank_contribs // // (url, rank) // (url, rank) // (url, rank) // ... // auto sum_rank_contrib_fn = [](const Page_Rank& p1, const Page_Rank& p2) { // assert(p1.first == p2.first); // return p1.second + p2.second; // }; ranks = contribs.ReduceToIndex( [](const Page_Rank& p) { return p.first; }, [](const Page_Rank& p1, const Page_Rank& p2) { return std::make_pair(p1.first, p1.second + p2.second); }, number_nodes) .Map( [](const Page_Rank p) { LOG << "ranks2 in " << p.first << "-" << p.second; if (std::fabs(p.second) <= 1E-5) { LOG << "ranks2 " << 0.0; return (Rank)0.0; } else { LOG << "ranks2 " << f + s * p.second; return f + s * p.second; } }).Keep().Collapse(); } // write result to line. add 1 to node_ids to revert back to normal auto res = ranks.Zip(node_ids, [](const Rank r, const Node n) { return std::to_string(n) + ": " + std::to_string(r); }); assert(res.Size() == links.Size()); return res; }