Ejemplo n.º 1
0
// Iteratively computes a rank value for every page of a link graph and returns
// the ranks formatted as text lines.
//
// in:   DIA of text lines. Each line is split on a single space; the first two
//       fields are parsed with std::stoi as the source and target page id of
//       one link. 1 is subtracted from both ids before they are stored.
// ctx:  context handed to Generate() when the initial rank DIA and the node id
//       DIA are created.
// iter: number of rank update rounds to run.
//
// Returns a DIA of strings of the form "<node id>: <rank>", where the node id
// is the generated index plus 1.
//
// NOTE(review): Page_Link, Node, Rank, Outgoings, Outgoings_Rank, Page_Rank,
// Key and the constants f and s are not declared in this function; they are
// expected to come from the enclosing scope.
auto PageRank(const DIA<std::string, InStack>&in, api::Context & ctx, int iter) {

    // parse every input line into a (source, target) link and cache the result
    DIA<Page_Link> input = in.Map(
        [](const std::string& input) {
            auto split = thrill::common::Split(input, " ");
            LOG0 << "input "
                 << (std::stoi(split[0]) - 1)
                 << " "
                 << (std::stoi(split[1]) - 1);
                    // subtract 1 from both ids so that page ids start at 0
            return Page_Link((size_t)(std::stoi(split[0]) - 1),
                             (size_t)(std::stoi(split[1]) - 1));
        }).Cache(); // TODO(SL): when Cache() is removed, code doesn't compile,
    // auto cannot be used either

    // The links are later aggregated per source page into this format
    // (one entry per page, holding the targets of its outgoing links):
    //
    //  URL   OUTGOING
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ...

    // get number of nodes by finding max page_id:
    // the reduction keeps the largest id seen so far (source or target) in
    // the .first component of the pair it returns.
    // add 1 to max node_id to get number of nodes because of node_id 0
    const auto number_nodes = input.Sum(
        [](const Page_Link& in1, const Page_Link& in2) {
            Node first = std::max(in1.first, in2.first);
            Node second = std::max(in1.second, in2.second);
            return std::make_pair(std::max(first, second), first);
        }).first + 1;

    LOG << "number_nodes " << number_nodes;

    // group outgoing links: the key is the source page id, the group function
    // collects the target ids of all links of that page into a vector
    auto links = input.GroupByIndex<Outgoings>(
        [](Page_Link p) { return p.first; },
        [](auto& r, Key) {
            std::vector<Node> all;
            while (r.HasNext()) {
                all.push_back(r.Next().second);
            }

            // std::string s = "{";
            // for (auto e : all) {
            //     s+= std::to_string(e) + ", ";
            // }
            // LOG << "links " << s << "}";

            return all;
        },
        number_nodes).Cache();

    // initialize all ranks to 1.0
    //
    // (url, rank)
    // (url, rank)
    // (url, rank)
    // ...
    // auto ranks = Generate(ctx, [](const size_t& index) {
    //     return std::make_pair(index, 1.0);
    // }, number_nodes).Cache();
    auto ranks = Generate(ctx,
                          [](const size_t&) {
                              return (Rank)1.0;
                          }, number_nodes).Cache();

    // node ids used for the output lines: generated index + 1, which undoes
    // the "- 1" applied while parsing the input
    auto node_ids = Generate(ctx,
                             [](const size_t& index) {
                                 return index + 1;
                             }, number_nodes);

    // do iterations
    for (int i = 0; i < iter; ++i) {
        LOG << "iteration " << i;

        // for all outgoing link, get their rank contribution from all
        // links by doing:
        //
        // 1) group all outgoing links with rank of its parent page: (Zip)
        //
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        //
        // 2) compute rank contribution for each linked_url: (FlatMap)
        //
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // ...

        // prints both sizes to stdout in every iteration
        std::cout << links.Size() << std::endl;
        std::cout << ranks.Size() << std::endl;

        // the Zip below pairs links and ranks, so both must have the same
        // size; the Size() calls in this check disappear when NDEBUG is set
        assert(links.Size() == ranks.Size());

        // TODO(SL): when Zip/FlatMap chained, code doesn't compile, please check
        DIA<Outgoings_Rank> outs_rank = links.Zip(ranks,
                                                  [](const Outgoings& l, const Rank r) {
                                                      // std::string s = "{";
                                                      // for (auto e : l) {
                                                      //     s += std::to_string(e) + ", ";
                                                      // }
                                                      // s += "}";
                                                      // LOG << "contribs1 " << s << " " << r;

                                                      return std::make_pair(l, r);
                                                  });
        // every page splits its rank evenly among its outgoing links;
        // pages without outgoing links emit nothing
        auto contribs = outs_rank.FlatMap<Page_Rank>(
            [](const Outgoings_Rank& p, auto emit) {
                if (p.first.size() > 0) {
                    Rank rank_contrib = p.second / p.first.size();
                    // assert (rank_contrib <= 1);
                    for (auto e : p.first) {
                        LOG << "contribs2 " << e << " " << rank_contrib;
                        emit(std::make_pair(e, rank_contrib));
                    }
                }
            });

        // reduce all rank contributions by adding all rank contributions
        // per target page and compute the new rank as
        // f + s * sum_rank_contribs
        // (f and s are defined outside this function; presumably 0.15 and
        // 0.85 -- TODO confirm)
        //
        // (url, rank)
        // (url, rank)
        // (url, rank)
        // ...

        // auto sum_rank_contrib_fn = [](const Page_Rank& p1, const Page_Rank& p2) {
        //     assert(p1.first == p2.first);
        //     return p1.second + p2.second;
        // };
        // NOTE(review): the size check at the top of the loop expects this to
        // yield one rank per index in [0, number_nodes) -- confirm against
        // ReduceToIndex
        ranks = contribs.ReduceToIndex(
            [](const Page_Rank& p) { return p.first; },
            [](const Page_Rank& p1, const Page_Rank& p2) {
                return std::make_pair(p1.first, p1.second + p2.second);
            }, number_nodes)
                .Map(
            [](const Page_Rank p) {
                LOG << "ranks2 in " << p.first << "-" << p.second;
                // a contribution sum of (almost) zero results in rank 0.0
                // instead of f
                if (std::fabs(p.second) <= 1E-5) {
                    LOG << "ranks2 " << 0.0;
                    return (Rank)0.0;
                }
                else {
                    LOG << "ranks2 " << f + s * p.second;
                    return f + s * p.second;
                }
            }).Keep().Collapse();
    }

    // write result to line. node_ids already holds index + 1, which reverts
    // the ids back to the numbering used in the input
    auto res = ranks.Zip(node_ids,
                         [](const Rank r, const Node n) {
                             return std::to_string(n)
                             + ": " + std::to_string(r);
                         });

    // one output line per page; the Size() calls in this check disappear
    // when NDEBUG is set
    assert(res.Size() == links.Size());

    return res;
}
Ejemplo n.º 2
0
// Returns the element of data that is at 0-based position rank in the order
// defined by compare, using a recursive selection with two pivots.
//
// data:    input DIA; its Size() is evaluated once per call.
// rank:    0-based position of the requested element, must be < data.Size().
// compare: "less than" predicate used for all comparisons; it has to be a
//          strict weak ordering (the same requirement std::nth_element has).
//
// If the input has at most base_case_size elements, everything is gathered,
// the worker with rank 0 runs std::nth_element and the result is broadcast.
// Otherwise two pivots are obtained from PickPivots, the input is split into
//   left:   elements less than left_pivot,
//   middle: elements neither less than left_pivot nor greater than right_pivot,
//   right:  elements greater than right_pivot,
// and the function either returns a pivot directly or recurses into the
// partition that contains the requested rank.
//
// NOTE(review): base_case_size, PickPivots and the LOG/LOGM macros are defined
// outside this function. Returning a pivot as the result relies on PickPivots
// returning elements of data -- confirm.
ValueType Select(const DIA<ValueType, InStack>& data, size_t rank,
                 const Compare& compare = Compare()) {
    api::Context& ctx = data.context();
    const size_t size = data.Size();

    // rank is unsigned, so only the upper bound needs to be checked
    assert(rank < size);

    if (size <= base_case_size) {
        // base case, gather all data at worker with rank 0
        ValueType result = ValueType();
        auto elements = data.Gather();

        if (ctx.my_rank() == 0) {
            assert(rank < elements.size());
            std::nth_element(elements.begin(), elements.begin() + rank,
                             elements.end(), compare);

            result = elements[rank];

            LOG << "base case: " << size << " elements remaining, result is "
                << result;
        }

        // hand the result computed on worker 0 to all workers
        result = ctx.net.Broadcast(result);
        return result;
    }

    ValueType left_pivot, right_pivot;
    std::tie(left_pivot, right_pivot) = PickPivots(data, size, rank, compare);

    size_t left_size, middle_size, right_size;

    // count the elements of the left and the middle partition in one pass:
    // every element is mapped to a (is_left, is_middle) pair and the pairs
    // are summed component-wise
    using PartSizes = std::pair<size_t, size_t>;
    std::tie(left_size, middle_size) =
        data.Map(
            [&](const ValueType& elem) -> PartSizes {
                if (compare(elem, left_pivot))
                    return PartSizes { 1, 0 };
                else if (!compare(right_pivot, elem))
                    return PartSizes { 0, 1 };
                else
                    return PartSizes { 0, 0 };
            })
        .Sum(
            [](const PartSizes& a, const PartSizes& b) -> PartSizes {
                return PartSizes { a.first + b.first, a.second + b.second };
            },
            PartSizes { 0, 0 });
    right_size = size - left_size - middle_size;

    LOGM << "left_size = " << left_size << ", middle_size = " << middle_size
         << ", right_size = " << right_size << ", rank = " << rank;

    if (rank == left_size) {
        // all the elements strictly smaller than the left pivot are on the left
        // side -> left_size-th element is the left pivot
        LOGM << "result is left pivot: " << left_pivot;
        return left_pivot;
    }
    else if (rank == left_size + middle_size - 1) {
        // only the elements strictly greater than the right pivot are on the
        // right side, so the result is the right pivot in this case
        LOGM << "result is right pivot: " << right_pivot;
        return right_pivot;
    }
    else if (rank < left_size) {
        // recurse on the left partition
        LOGM << "Recursing left, " << left_size
             << " elements remaining (rank = " << rank << ")\n";

        auto left = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(elem, left_pivot);
            }).Collapse();
        assert(left.Size() == left_size);

        return Select(left, rank, compare);
    }
    else if (left_size + middle_size <= rank) {
        // recurse on the right partition
        LOGM << "Recursing right, " << right_size
             << " elements remaining (rank = " << rank - left_size - middle_size
             << ")\n";

        auto right = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(right_pivot, elem);
            }).Collapse();
        assert(right.Size() == right_size);

        return Select(right, rank - left_size - middle_size, compare);
    }
    else {
        // At this point left_size < rank < left_size + middle_size - 1, so the
        // middle partition is not empty and left_pivot is therefore not
        // greater than right_pivot.
        if (!compare(left_pivot, right_pivot)) {
            // The pivots are equivalent, hence every element of the middle
            // partition is equivalent to them and the pivot is the answer.
            // Recursing instead would operate on mutually equivalent
            // elements only: such a partition can never shrink, so the
            // recursion would not end once it exceeds base_case_size.
            LOGM << "result is pivot (pivots are equal): " << left_pivot;
            return left_pivot;
        }

        // recurse on the middle partition
        LOGM << "Recursing middle, " << middle_size
             << " elements remaining (rank = " << rank - left_size << ")\n";

        auto middle = data.Filter(
            [&](const ValueType& elem) -> bool {
                return !compare(elem, left_pivot) &&
                !compare(right_pivot, elem);
            }).Collapse();
        assert(middle.Size() == middle_size);

        return Select(middle, rank - left_size, compare);
    }
}