auto PageRank(const DIA<std::string, InStack>& in, api::Context& ctx, int iter) {

    DIA<Page_Link> input = in.Map(
        [](const std::string& input) {
            auto split = thrill::common::Split(input, " ");
            LOG0 << "input " << (std::stoi(split[0]) - 1)
                 << " " << (std::stoi(split[1]) - 1);
            // set base of page_id to 0
            return Page_Link((size_t)(std::stoi(split[0]) - 1),
                             (size_t)(std::stoi(split[1]) - 1));
        }).Cache();
    // TODO(SL): when Cache() is removed, code doesn't compile,
    // auto cannot be used either

    // aggregate all outgoing links of a page in this format:
    //
    // URL   OUTGOING
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ...

    // get number of nodes by finding the max page_id;
    // add 1 to the max node_id to get the number of nodes, because of node_id 0
    const auto number_nodes = input.Sum(
        [](const Page_Link& in1, const Page_Link& in2) {
            Node first = std::max(in1.first, in2.first);
            Node second = std::max(in1.second, in2.second);
            return std::make_pair(std::max(first, second), first);
        }).first + 1;

    LOG << "number_nodes " << number_nodes;

    // group outgoing links by their source page
    auto links = input.GroupByIndex<Outgoings>(
        [](Page_Link p) { return p.first; },
        [](auto& r, Key) {
            std::vector<Node> all;
            while (r.HasNext()) {
                all.push_back(r.Next().second);
            }
            // std::string s = "{";
            // for (auto e : all) {
            //     s += std::to_string(e) + ", ";
            // }
            // LOG << "links " << s << "}";
            return all;
        },
        number_nodes).Cache();

    // initialize all ranks to 1.0
    //
    // (url, rank)
    // (url, rank)
    // (url, rank)
    // ...

    // auto ranks = Generate(ctx, [](const size_t& index) {
    //     return std::make_pair(index, 1.0);
    // }, number_nodes).Cache();
    auto ranks = Generate(ctx, [](const size_t&) {
        return (Rank)1.0;
    }, number_nodes).Cache();

    auto node_ids = Generate(ctx, [](const size_t& index) {
        return index + 1;
    }, number_nodes);

    // do iterations
    for (int i = 0; i < iter; ++i) {
        LOG << "iteration " << i;

        // for all outgoing links, get their rank contribution from all
        // links by doing:
        //
        // 1) group all outgoing links with the rank of their parent page: (Zip)
        //
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        //
        // 2) compute the rank contribution for each linked_url: (FlatMap)
        //
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // ...

        std::cout << links.Size() << std::endl;
        std::cout << ranks.Size() << std::endl;
        assert(links.Size() == ranks.Size());

        // TODO(SL): when Zip/FlatMap are chained, code doesn't compile, please check
        DIA<Outgoings_Rank> outs_rank = links.Zip(
            ranks, [](const Outgoings& l, const Rank r) {
                // std::string s = "{";
                // for (auto e : l) {
                //     s += std::to_string(e) + ", ";
                // }
                // s += "}";
                // LOG << "contribs1 " << s << " " << r;
                return std::make_pair(l, r);
            });

        auto contribs = outs_rank.FlatMap<Page_Rank>(
            [](const Outgoings_Rank& p, auto emit) {
                if (p.first.size() > 0) {
                    Rank rank_contrib = p.second / p.first.size();
                    // assert(rank_contrib <= 1);
                    for (auto e : p.first) {
                        LOG << "contribs2 " << e << " " << rank_contrib;
                        emit(std::make_pair(e, rank_contrib));
                    }
                }
            });

        // reduce all rank contributions by adding them up per page
        // and compute the new rank as 0.15 + 0.85 * sum_rank_contribs
        //
        // (url, rank)
        // (url, rank)
        // (url, rank)
        // ...

        // auto sum_rank_contrib_fn = [](const Page_Rank& p1, const Page_Rank& p2) {
        //     assert(p1.first == p2.first);
        //     return p1.second + p2.second;
        // };

        // f and s are the damping constants (0.15 and 0.85, see the comment
        // above); they are defined outside this function.
        ranks = contribs.ReduceToIndex(
            [](const Page_Rank& p) { return p.first; },
            [](const Page_Rank& p1, const Page_Rank& p2) {
                return std::make_pair(p1.first, p1.second + p2.second);
            }, number_nodes)
            .Map(
                [](const Page_Rank p) {
                    LOG << "ranks2 in " << p.first << "-" << p.second;
                    if (std::fabs(p.second) <= 1E-5) {
                        LOG << "ranks2 " << 0.0;
                        return (Rank)0.0;
                    }
                    else {
                        LOG << "ranks2 " << f + s * p.second;
                        return f + s * p.second;
                    }
                }).Keep().Collapse();
    }

    // write result to lines. node_ids carry the +1 that reverts the page ids
    // back to their original 1-based form
    auto res = ranks.Zip(
        node_ids, [](const Rank r, const Node n) {
            return std::to_string(n) + ": " + std::to_string(r);
        });

    assert(res.Size() == links.Size());

    return res;
}
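
// The PageRank listing above uses several types (Node, Rank, Key, Page_Link,
// Outgoings, Outgoings_Rank, Page_Rank) and two constants (f, s) that are
// defined outside the shown function. The following is a minimal sketch of
// plausible definitions, deduced only from how the names are used above; the
// exact original declarations are not part of this listing and these are
// assumptions. In a real translation unit they would have to appear before
// PageRank().

#include <cstddef>
#include <utility>
#include <vector>

using Node = std::size_t;                          // zero-based page id
using Rank = double;                               // PageRank value
using Key  = std::size_t;                          // key type for GroupByIndex
using Page_Link      = std::pair<Node, Node>;      // (source page, linked page)
using Outgoings      = std::vector<Node>;          // outgoing links of one page
using Outgoings_Rank = std::pair<Outgoings, Rank>; // links of a page plus its rank
using Page_Rank      = std::pair<Node, Rank>;      // (page, rank contribution)

// damping constants used in "new_rank = f + s * sum_rank_contribs"
// (0.15 and 0.85 according to the comment in the listing)
static constexpr double f = 0.15;
static constexpr double s = 0.85;

// A hypothetical driver could then read an edge list, run a few iterations and
// write the ranks, e.g. PageRank(ReadLines(ctx, input_path), ctx, 10)
// .WriteLines(output_path); the exact entry points depend on the Thrill version.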
ValueType Select(const DIA<ValueType, InStack>& data, size_t rank,
                 const Compare& compare = Compare()) {
    api::Context& ctx = data.context();
    const size_t size = data.Size();

    assert(0 <= rank && rank < size);

    if (size <= base_case_size) {
        // base case, gather all data at worker with rank 0
        ValueType result = ValueType();
        auto elements = data.Gather();

        if (ctx.my_rank() == 0) {
            assert(rank < elements.size());
            std::nth_element(elements.begin(), elements.begin() + rank,
                             elements.end(), compare);

            result = elements[rank];

            LOG << "base case: " << size << " elements remaining, result is "
                << result;
        }

        result = ctx.net.Broadcast(result);
        return result;
    }

    ValueType left_pivot, right_pivot;
    std::tie(left_pivot, right_pivot) = PickPivots(data, size, rank, compare);

    size_t left_size, middle_size, right_size;

    using PartSizes = std::pair<size_t, size_t>;
    std::tie(left_size, middle_size) =
        data.Map(
            [&](const ValueType& elem) -> PartSizes {
                if (compare(elem, left_pivot))
                    return PartSizes { 1, 0 };
                else if (!compare(right_pivot, elem))
                    return PartSizes { 0, 1 };
                else
                    return PartSizes { 0, 0 };
            })
        .Sum(
            [](const PartSizes& a, const PartSizes& b) -> PartSizes {
                return PartSizes { a.first + b.first, a.second + b.second };
            },
            PartSizes { 0, 0 });
    right_size = size - left_size - middle_size;

    LOGM << "left_size = " << left_size << ", middle_size = " << middle_size
         << ", right_size = " << right_size << ", rank = " << rank;

    if (rank == left_size) {
        // all the elements strictly smaller than the left pivot are on the left
        // side -> left_size-th element is the left pivot
        LOGM << "result is left pivot: " << left_pivot;
        return left_pivot;
    }
    else if (rank == left_size + middle_size - 1) {
        // only the elements strictly greater than the right pivot are on the
        // right side, so the result is the right pivot in this case
        LOGM << "result is right pivot: " << right_pivot;
        return right_pivot;
    }
    else if (rank < left_size) {
        // recurse on the left partition
        LOGM << "Recursing left, " << left_size
             << " elements remaining (rank = " << rank << ")\n";

        auto left = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(elem, left_pivot);
            }).Collapse();
        assert(left.Size() == left_size);

        return Select(left, rank, compare);
    }
    else if (left_size + middle_size <= rank) {
        // recurse on the right partition
        LOGM << "Recursing right, " << right_size
             << " elements remaining (rank = "
             << rank - left_size - middle_size << ")\n";

        auto right = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(right_pivot, elem);
            }).Collapse();
        assert(right.Size() == right_size);

        return Select(right, rank - left_size - middle_size, compare);
    }
    else {
        // recurse on the middle partition
        LOGM << "Recursing middle, " << middle_size
             << " elements remaining (rank = " << rank - left_size << ")\n";

        auto middle = data.Filter(
            [&](const ValueType& elem) -> bool {
                return !compare(elem, left_pivot) && !compare(right_pivot, elem);
            }).Collapse();
        assert(middle.Size() == middle_size);

        return Select(middle, rank - left_size, compare);
    }
}
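
// For context, a minimal sketch of how Select() might be invoked, here to pick
// the median of a small generated dataset. This is an assumption-based usage
// example, not part of the original listing: it presumes Select() is exposed as
// a template over ValueType/InStack/Compare, that its helpers (PickPivots,
// base_case_size, LOGM) are available, and it reuses Generate() with the same
// call convention as in the PageRank listing above.

#include <functional>
#include <iostream>

void MedianExample(api::Context& ctx) {
    // generate 1000 deterministic, pseudo-shuffled integers
    auto data = Generate(
        ctx, [](const size_t& i) { return static_cast<int>((i * 37) % 1000); },
        1000);

    // zero-based rank 500 selects the 501st-smallest element (upper median)
    int median = Select(data, 500, std::less<int>());

    if (ctx.my_rank() == 0)
        std::cout << "median = " << median << std::endl;
}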