コード例 #1
0
ファイル: sum_node_test.cpp プロジェクト: ShauryaRawat/thrill
// Reads integers from a text file, pushes them through an identity Map and a
// Collapse, then checks both the sum and the number of items.
TEST(SumNode, GenerateAndSumHaveEqualAmount2) {

    // Test body; handed to api::RunLocalTests below.
    std::function<void(Context&)> run_on_worker =
        [](Context& ctx) {

            // Converts one input line into an int.
            auto parse_int = [](const std::string& text) {
                                 return std::stoi(text);
                             };

            // Passes each value through unchanged.
            auto identity = [](int value) { return value; };

            // Binary reducer used by Sum.
            auto plus = [](int lhs, int rhs) { return lhs + rhs; };

            // TODO(ms): Replace this with some test-specific rendered file
            auto numbers = ReadLines(ctx, "inputs/test1").Map(parse_int);
            auto passthrough = numbers.Map(identity);

            DIA<int> collapsed = passthrough.Collapse();

            // Expected values presumably correspond to a file containing
            // the numbers 1..16 (sum 136) -- verify against inputs/test1.
            ASSERT_EQ(136, collapsed.Sum(plus));
            ASSERT_EQ(16u, collapsed.Size());
        };

    api::RunLocalTests(run_on_worker);
}
コード例 #2
0
// Builds a small DIA graph (Generate -> FlatMap -> Cache -> ReduceBy) and
// checks the node reference count reported by every DIA handle after the
// graph has been executed.
TEST(Stage, CountReferencesLOpNode) {

    std::function<void(Context&)> start_func =
        [](Context& ctx) {

            // Source DIA: the values index + 1 for 16 indices.
            auto integers = Generate(
                ctx,
                [](const size_t& index) {
                    return static_cast<int>(index) + 1;
                },
                16);

            // FlatMap function: emits every input item twice.
            auto duplicate_elements = [](int in, auto emit) {
                                          emit(in);
                                          emit(in);
                                      };

            // Key extractor for ReduceBy: parity of the item.
            auto modulo_two = [](int in) {
                                  return (in % 2);
                              };

            // Reduce function for ReduceBy: adds two items.
            auto add_function = [](int in1, int in2) {
                                    return in1 + in2;
                                };

            // Creates a second DIA handle; presumably it refers to the same
            // Generate node as `integers`, since the assertions below expect
            // identical reference counts for both handles.
            auto doubles = integers.FlatMap(duplicate_elements);

            // Creates a child reference to the Generate node and a new DIA
            // handle referring to the node created by Cache().
            DIA<int> quadruples = integers.FlatMap(duplicate_elements).Cache();

            // Creates a new child reference to the node behind `quadruples`.
            auto reduced = quadruples.ReduceBy(modulo_two, add_function);

            // Trigger execution; the gathered values themselves are not
            // inspected by this test.
            std::vector<int> out_vec = reduced.AllGather();

            // 2x DIA reference (integers, doubles) + 1x child reference
            ASSERT_EQ(integers.node_refcount(), 3u);
            ASSERT_EQ(doubles.node_refcount(), 3u);
            // 1x DIA reference + 1x child reference
            ASSERT_EQ(quadruples.node_refcount(), 2u);
            // 1x DIA reference + 0x child reference
            ASSERT_EQ(reduced.node_refcount(), 1u);
        };

    api::RunLocalTests(start_func);
}
コード例 #3
0
ファイル: select.hpp プロジェクト: Cyaagain/thrill
std::pair<ValueType, ValueType>
PickPivots(const DIA<ValueType, InStack>& data, size_t size, size_t rank,
           const Compare& compare = Compare()) {
    // Picks a lower and an upper pivot around the position at which the
    // element of the requested rank is expected inside a random sample.
    //
    // data    -- DIA to sample from
    // size    -- number of items in data, as supplied by the caller
    // rank    -- rank of the element being searched for
    // compare -- ordering used to sort the sample
    //
    // Returns the (lower, upper) pivot pair; it is selected on worker 0 and
    // broadcast, so every worker returns the same pair.
    //
    // The sample gathered at worker 0 must not be empty, otherwise no valid
    // pivot position exists. This is asserted below.
    api::Context& ctx = data.context();

    const size_t num_workers(ctx.num_workers());
    const double size_d = static_cast<double>(size);

    // Sampling probability; presumably yields about 20 * sqrt(num_workers)
    // samples in expectation -- depends on BernoulliSample semantics.
    const double p = 20 * sqrt(static_cast<double>(num_workers)) / size_d;

    // materialized at worker 0
    auto sample = data.BernoulliSample(p).Gather();

    std::pair<ValueType, ValueType> pivots;
    if (ctx.my_rank() == 0) {
        LOG << "got " << sample.size() << " samples (p = " << p << ")";

        // Without this check `sample.size() - 1` below would wrap around
        // and the pivot lookup would index far out of bounds.
        assert(!sample.empty());

        // Sort the samples
        std::sort(sample.begin(), sample.end(), compare);

        // Scale the rank into sample coordinates. The product is formed in
        // floating point so that rank * sample.size() cannot wrap around in
        // size_t for very large inputs.
        const double base_pos =
            static_cast<double>(rank) * static_cast<double>(sample.size())
            / size_d;
        // Safety margin on both sides of the expected position.
        const double offset = pow(size_d, 0.25 + delta);

        long lower_pos = static_cast<long>(floor(base_pos - offset));
        long upper_pos = static_cast<long>(floor(base_pos + offset));

        // Clamp both positions into the valid index range of the sample.
        size_t lower = static_cast<size_t>(std::max(0L, lower_pos));
        size_t upper = static_cast<size_t>(
            std::min(upper_pos, static_cast<long>(sample.size() - 1)));

        // Both indices are unsigned, so only the upper bound needs checking.
        assert(lower < sample.size());
        assert(upper < sample.size());

        LOG << "Selected pivots at positions " << lower << " and " << upper
            << ": " << sample[lower] << " and " << sample[upper];

        pivots = std::make_pair(sample[lower], sample[upper]);
    }

    // Distribute the pivots chosen on worker 0 to all workers.
    pivots = ctx.net.Broadcast(pivots);

    LOGM << "pivots: " << pivots.first << " and " << pivots.second;

    return pivots;
}
コード例 #4
0
ファイル: collapse.hpp プロジェクト: bingmann/thrill
    static DIA<ValueType> MakeCollapse(const DIA<ValueType, Stack>& dia) {
        // Collapsing only makes sense for a valid input DIA.
        assert(dia.IsValid());

        // The new node is constructed from the incoming DIA (taking over
        // its function stack); the returned DIA uses the default stack type
        // and refers to that new node.
        using NodeType = api::CollapseNode<ValueType>;

        return DIA<ValueType>(tlx::make_counting<NodeType>(dia));
    }
コード例 #5
0
ファイル: graph_test.cpp プロジェクト: chen--oRanGe/thrill
// Repeatedly re-assigns a DIA inside a while loop and checks the final
// number of items, then writes the stats graph layout.
TEST(Graph, WhileLoop) {

    std::function<void(Context&)> run_on_worker =
        [](Context& ctx) {

            // FlatMap function: emits every incoming item twice.
            auto emit_twice = [](size_t item, auto emit) {
                                  emit(item);
                                  emit(item);
                              };

            // Map function: doubles the value of an item.
            auto times_two = [](size_t item) {
                                 return 2 * item;
                             };

            auto indices = Generate(
                ctx,
                [](const size_t& index) -> size_t {
                    return index;
                },
                16);

            DIA<size_t> current = indices.Collapse();
            size_t item_count = 0;

            // Every pass duplicates each item, so the item count doubles;
            // the loop ends as soon as the DIA holds at least 64 items
            // (the assertions below expect exactly 64).
            while (item_count < 64) {
                auto duplicated = current.FlatMap(emit_twice);
                auto scaled = duplicated.Map(times_two);
                current = scaled.Cache();
                item_count = current.Size();
            }

            std::vector<size_t> gathered = current.AllGather();

            ASSERT_EQ(64u, gathered.size());
            ASSERT_EQ(64u, current.Size());

            ctx.stats_graph().BuildLayout("loop.out");
        };

    api::RunLocalTests(run_on_worker);
}
コード例 #6
0
ファイル: collapse.hpp プロジェクト: ShauryaRawat/thrill
DIA<ValueType, Stack>::DIA(const DIA<ValueType, AnyStack>& rhs) {

    // Converting constructor: instead of chaining rhs' function stack
    // inline, a CollapseNode is built from rhs and this DIA refers to that
    // node while keeping its own default-constructed stack_.
    using SourceDIA = DIA<ValueType, AnyStack>;
    using NodeType = api::CollapseNode<ValueType, SourceDIA>;

    LOG0 << "WARNING: cast to DIA creates CollapseNode instead of inline chaining.";
    LOG0 << "Consider whether you can use auto instead of DIA.";

    // Register the collapse step as a child of rhs in the stats graph.
    StatsNode* collapse_stats =
        rhs.AddChildStatsNode("Collapse", DIANodeType::COLLAPSE);

    node_ = std::make_shared<NodeType>(rhs, collapse_stats);
    stats_parents_.emplace_back(collapse_stats);
}
コード例 #7
0
ファイル: k-means_run.cpp プロジェクト: Cyaagain/thrill
void OutputSVG(const std::string& svg_path, double svg_scale,
               const DIA<Point<2> >& point_dia,
               const KMeansModel<Point<2> >& model) {
    double width = 0, height = 0;

    using Point2D = Point<2>;

    const std::vector<Point2D>& centroids = model.centroids();
    std::vector<PointClusterId<Point2D> > list =
        model.ClassifyPairs(point_dia).Gather();

    for (const PointClusterId<Point2D>& p : list) {
        width = std::max(width, p.first.x[0]);
        height = std::max(height, p.first.x[1]);
    }

    if (point_dia.context().my_rank() != 0) return;

    std::ofstream os(svg_path);

    os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    os << "<svg\n";
    os << "   xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
    os << "   xmlns:cc=\"http://creativecommons.org/ns#\"\n";
    os << "   xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
    os << "   xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
    os << "   xmlns=\"http://www.w3.org/2000/svg\"\n";
    os << "   version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
       << "\" height=\"" << height * svg_scale << "\">\n";
    os << "  <g id=\"layer1\">\n";

    for (const PointClusterId<Point2D>& p : list) {
        os << "    <circle r=\"1\" cx=\"" << p.first.x[0] * svg_scale
           << "\" cy=\"" << p.first.x[1] * svg_scale
           << "\" style=\"stroke:none;stroke-opacity:1;fill:"
           << SVGColor(p.second) << ";fill-opacity:1\" />\n";
    }
    for (size_t i = 0; i < centroids.size(); ++i) {
        const Point2D& p = centroids[i];
        os << "    <circle r=\"4\" cx=\"" << p.x[0] * svg_scale
           << "\" cy=\"" << p.x[1] * svg_scale
           << "\" style=\"stroke:black;stroke-opacity:1;fill:"
           << SVGColor(i) << ";fill-opacity:1\" />\n";
    }
    os << " </g>\n";
    os << "</svg>\n";
}
コード例 #8
0
ファイル: zip_node_test.cpp プロジェクト: chen--oRanGe/thrill
// Zips two filtered views of the same string DIA and compares the result
// with a zip that is recomputed locally from the gathered input.
TEST(ZipNode, TwoDisbalancedStringArrays) {

    // The two Zip inputs keep disjoint parts of the data (strings starting
    // with a-e vs. w-z); per the original note the first DIA is heavily
    // balanced to the first workers and the second to the last workers.
    std::function<void(Context&)> start_func =
        [](Context& ctx) {

            // generate random strings with 10..20 characters, followed by
            // the decimal index of the item
            auto input_gen = Generate(
                ctx,
                [](size_t index) -> std::string {
                    std::default_random_engine rng(
                        123456 + static_cast<unsigned>(index));
                    std::uniform_int_distribution<size_t> length(10, 20);
                    rng(); // skip one number

                    return common::RandomString(
                        length(rng), rng, "abcdefghijklmnopqrstuvwxyz")
                    + std::to_string(index);
                },
                test_size);

            DIA<std::string> input = input_gen.Cache();

            std::vector<std::string> vinput = input.AllGather();
            ASSERT_EQ(test_size, vinput.size());

            // Keep only strings whose first character is <= 'e'
            auto input1 = input.Filter(
                [](const std::string& str) { return str[0] <= 'e'; });

            // Keep only strings whose first character is >= 'w'
            auto input2 = input.Filter(
                [](const std::string& str) { return str[0] >= 'w'; });

            // zip: concatenate the strings pairwise
            auto zip_result = input1.Zip(
                input2, [](const std::string& a, const std::string& b) {
                    return a + b;
                });

            // check result
            std::vector<std::string> res = zip_result.AllGather();

            // recalculate result locally
            std::vector<std::string> check;
            {
                std::vector<std::string> v1, v2;

                for (size_t index = 0; index < vinput.size(); ++index) {
                    const std::string& s1 = vinput[index];
                    if (s1[0] <= 'e') v1.push_back(s1);
                    if (s1[0] >= 'w') v2.push_back(s1);
                }

                ASSERT_EQ(v1, input1.AllGather());
                ASSERT_EQ(v2, input2.AllGather());

                // the expected result has as many items as the shorter side
                for (size_t i = 0; i < std::min(v1.size(), v2.size()); ++i) {
                    check.push_back(v1[i] + v2[i]);
                    // sLOG1 << check.back();
                }
            }

            // Compare the sizes first: the debug loop below indexes both
            // vectors with the same index and must not run past the end of
            // `check` when the Zip result is longer than expected.
            ASSERT_EQ(check.size(), res.size());

            for (size_t i = 0; i != res.size(); ++i) {
                sLOG0 << res[i] << " " << check[i] << (res[i] == check[i]);
            }

            ASSERT_EQ(check, res);
        };

    api::RunLocalTests(start_func);
}
コード例 #9
0
ファイル: page_rank.hpp プロジェクト: ShauryaRawat/thrill
auto PageRank(const DIA<std::string, InStack>&in, api::Context & ctx, int iter) {
    // Runs `iter` rounds of a PageRank-style rank computation.
    //
    // in   -- DIA of text lines; each line is split at " " and its first two
    //         fields are parsed as page ids (source page, linked page)
    // ctx  -- context used to generate the initial ranks and the node ids
    // iter -- number of iterations to run
    //
    // Returns a DIA of strings of the form "<node id>: <rank>".

    DIA<Page_Link> input = in.Map(
        [](const std::string& input) {
            auto split = thrill::common::Split(input, " ");
            LOG0 << "input "
                 << (std::stoi(split[0]) - 1)
                 << " "
                 << (std::stoi(split[1]) - 1);
                    // set base of page_id to 0
            return Page_Link((size_t)(std::stoi(split[0]) - 1),
                             (size_t)(std::stoi(split[1]) - 1));
        }).Cache(); // TODO(SL): when Cache() is removed, code doesn't compile,
    // auto cannot be used either

    // aggregate all outgoing links of a page in this format:
    //
    //  URL   OUTGOING
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ...

    // get number of nodes by finding max page_id
    // add 1 to max node_id to get number of nodes because of node_id 0
    // (the reducer keeps the largest id seen so far in .first of the pair)
    const auto number_nodes = input.Sum(
        [](const Page_Link& in1, const Page_Link& in2) {
            Node first = std::max(in1.first, in2.first);
            Node second = std::max(in1.second, in2.second);
            return std::make_pair(std::max(first, second), first);
        }).first + 1;

    LOG << "number_nodes " << number_nodes;

    // group outgoing links: one vector of link targets per source page id
    auto links = input.GroupByIndex<Outgoings>(
        [](Page_Link p) { return p.first; },
        [](auto& r, Key) {
            std::vector<Node> all;
            while (r.HasNext()) {
                all.push_back(r.Next().second);
            }

            // std::string s = "{";
            // for (auto e : all) {
            //     s+= std::to_string(e) + ", ";
            // }
            // LOG << "links " << s << "}";

            return all;
        },
        number_nodes).Cache();

    // initialize all ranks to 1.0
    //
    // (url, rank)
    // (url, rank)
    // (url, rank)
    // ...
    // auto ranks = Generate(ctx, [](const size_t& index) {
    //     return std::make_pair(index, 1.0);
    // }, number_nodes).Cache();
    auto ranks = Generate(ctx,
                          [](const size_t&) {
                              return (Rank)1.0;
                          }, number_nodes).Cache();

    // 1-based node ids, used for the final output lines
    auto node_ids = Generate(ctx,
                             [](const size_t& index) {
                                 return index + 1;
                             }, number_nodes);

    // do iterations
    for (int i = 0; i < iter; ++i) {
        LOG << "iteration " << i;

        // for all outgoing link, get their rank contribution from all
        // links by doing:
        //
        // 1) group all outgoing links with rank of its parent page: (Zip)
        //
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        //
        // 2) compute rank contribution for each linked_url: (FlatMap)
        //
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // ...

        // NOTE(review): unconditional debug output on stdout.
        std::cout << links.Size() << std::endl;
        std::cout << ranks.Size() << std::endl;

        // NOTE(review): these Size() calls are only evaluated when
        // assertions are enabled.
        assert(links.Size() == ranks.Size());

        // TODO(SL): when Zip/FlatMap chained, code doesn't compile, please check
        DIA<Outgoings_Rank> outs_rank = links.Zip(ranks,
                                                  [](const Outgoings& l, const Rank r) {
                                                      // std::string s = "{";
                                                      // for (auto e : l) {
                                                      //     s += std::to_string(e) + ", ";
                                                      // }
                                                      // s += "}";
                                                      // LOG << "contribs1 " << s << " " << r;

                                                      return std::make_pair(l, r);
                                                  });
        // pages without outgoing links emit no contribution at all
        auto contribs = outs_rank.FlatMap<Page_Rank>(
            [](const Outgoings_Rank& p, auto emit) {
                if (p.first.size() > 0) {
                    Rank rank_contrib = p.second / p.first.size();
                    // assert (rank_contrib <= 1);
                    for (auto e : p.first) {
                        LOG << "contribs2 " << e << " " << rank_contrib;
                        emit(std::make_pair(e, rank_contrib));
                    }
                }
            });

        // reduce all rank contributions by adding all rank contributions
        // and compute the new rank as f + s * sum_rank_contribs
        // (f and s are defined outside this function; presumably 0.15 and
        // 0.85 -- TODO confirm). Sums with an absolute value of at most
        // 1E-5 are mapped to rank 0.0 instead.
        //
        // (url, rank)
        // (url, rank)
        // (url, rank)
        // ...

        // auto sum_rank_contrib_fn = [](const Page_Rank& p1, const Page_Rank& p2) {
        //     assert(p1.first == p2.first);
        //     return p1.second + p2.second;
        // };
        ranks = contribs.ReduceToIndex(
            [](const Page_Rank& p) { return p.first; },
            [](const Page_Rank& p1, const Page_Rank& p2) {
                return std::make_pair(p1.first, p1.second + p2.second);
            }, number_nodes)
                .Map(
            [](const Page_Rank p) {
                LOG << "ranks2 in " << p.first << "-" << p.second;
                if (std::fabs(p.second) <= 1E-5) {
                    LOG << "ranks2 " << 0.0;
                    return (Rank)0.0;
                }
                else {
                    LOG << "ranks2 " << f + s * p.second;
                    return f + s * p.second;
                }
            }).Keep().Collapse();
    }

    // write result to line. node_ids are 1-based, which reverts the
    // subtraction of 1 applied to the page ids when parsing the input
    auto res = ranks.Zip(node_ids,
                         [](const Rank r, const Node n) {
                             return std::to_string(n)
                             + ": " + std::to_string(r);
                         });

    // NOTE(review): evaluated only when assertions are enabled.
    assert(res.Size() == links.Size());

    return res;
}
コード例 #10
0
void OutputSVG(const std::string& svg_path, double svg_scale,
               const DIA<DataPoint<Vector> >& point_dia,
               const Vector& model) {
    double width = 0, height = 0, min_vert = 0, max_vert = 0, min_hor = 0, max_hor = 0;

    std::vector<DataPoint<Vector> > list = point_dia.Gather();

    for (const DataPoint<Vector>& p : list) {
        min_hor = std::min(min_hor, p.data.x[0]);
        max_hor = std::max(max_hor, p.data.x[0]);
        min_vert = std::min(min_vert, p.label);
        max_vert = std::max(max_vert, p.label);
    }

    double weight = model.x[0];
    double y1 = min_hor * weight;
    double y2 = max_hor * weight;
    min_vert = std::min(min_vert, y1);
    min_vert = std::min(min_vert, y2);
    max_vert = std::max(max_vert, y1);
    max_vert = std::max(max_vert, y2);

    width = max_hor - min_hor;
    height = max_vert - min_vert;

    std::ofstream os(svg_path);

    os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    os << "<svg\n";
    os << "   xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
    os << "   xmlns:cc=\"http://creativecommons.org/ns#\"\n";
    os << "   xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
    os << "   xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
    os << "   xmlns=\"http://www.w3.org/2000/svg\"\n";
    os << "   version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
       << "\" height=\"" << height * svg_scale << "\">\n";
    os << "  <g id=\"layer1\">\n";

    // Draw grid
    os << "    <line x1=\"0\" y1=\"" << (height + min_vert) * svg_scale
       << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height + min_vert) * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
    os << "    <line x1=\"" << -min_hor * svg_scale << "\" y1=\"0\""
       << " x2=\"" << -min_hor * svg_scale << "\" y2=\"" << height * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";

    // Draw points
    for (const DataPoint<Vector>& p : list) {
        os << "    <circle r=\"1\" cx=\"" << (p.data.x[0] - min_hor) * svg_scale
           << "\" cy=\"" << (height - p.label + min_vert) * svg_scale
           << "\" style=\"stroke:none;stroke-opacity:1;fill:#45a2d1;fill-opacity:1\" />\n";
    }

    // Draw line
    os << "    <line x1=\"0\" y1=\"" << (height - y1 + min_vert) * svg_scale
       << "\" x2=\"" << width * svg_scale << "\" y2=\"" << (height - y2 + min_vert) * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#ff9900\" />\n";

    os << " </g>\n";
    os << "</svg>\n";
}
コード例 #11
0
ファイル: select.hpp プロジェクト: Cyaagain/thrill
ValueType Select(const DIA<ValueType, InStack>& data, size_t rank,
                 const Compare& compare = Compare()) {
    // Returns the element that would be at position `rank` (0-based) if
    // `data` were sorted with `compare`.
    //
    // Inputs with at most base_case_size items are gathered and solved
    // locally with std::nth_element. Larger inputs are split by two pivots
    // into a left part (before the left pivot), a middle part (between the
    // pivots, inclusive) and a right part (after the right pivot); the
    // search then continues in the part that contains the rank.
    api::Context& ctx = data.context();
    const size_t size = data.Size();

    // rank is unsigned, so only the upper bound has to be checked
    assert(rank < size);

    if (size <= base_case_size) {
        // base case, gather all data at worker with rank 0
        ValueType result = ValueType();
        auto elements = data.Gather();

        if (ctx.my_rank() == 0) {
            assert(rank < elements.size());
            std::nth_element(elements.begin(), elements.begin() + rank,
                             elements.end(), compare);

            result = elements[rank];

            LOG << "base case: " << size << " elements remaining, result is "
                << result;
        }

        // make the result known on every worker
        result = ctx.net.Broadcast(result);
        return result;
    }

    ValueType left_pivot, right_pivot;
    std::tie(left_pivot, right_pivot) = PickPivots(data, size, rank, compare);

    size_t left_size, middle_size, right_size;

    // Count the left and the middle part in one pass: every element is
    // mapped to a (left, middle) indicator pair and the pairs are summed.
    using PartSizes = std::pair<size_t, size_t>;
    std::tie(left_size, middle_size) =
        data.Map(
            [&](const ValueType& elem) -> PartSizes {
                if (compare(elem, left_pivot))
                    return PartSizes { 1, 0 };
                else if (!compare(right_pivot, elem))
                    return PartSizes { 0, 1 };
                else
                    return PartSizes { 0, 0 };
            })
        .Sum(
            [](const PartSizes& a, const PartSizes& b) -> PartSizes {
                return PartSizes { a.first + b.first, a.second + b.second };
            },
            PartSizes { 0, 0 });
    right_size = size - left_size - middle_size;

    LOGM << "left_size = " << left_size << ", middle_size = " << middle_size
         << ", right_size = " << right_size << ", rank = " << rank;

    if (rank == left_size) {
        // all the elements strictly smaller than the left pivot are on the left
        // side -> left_size-th element is the left pivot
        LOGM << "result is left pivot: " << left_pivot;
        return left_pivot;
    }
    else if (rank == left_size + middle_size - 1) {
        // only the elements strictly greater than the right pivot are on the
        // right side, so the result is the right pivot in this case
        LOGM << "result is right pivot: " << right_pivot;
        return right_pivot;
    }
    else if (rank < left_size) {
        // recurse on the left partition
        LOGM << "Recursing left, " << left_size
             << " elements remaining (rank = " << rank << ")\n";

        auto left = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(elem, left_pivot);
            }).Collapse();
        assert(left.Size() == left_size);

        return Select(left, rank, compare);
    }
    else if (left_size + middle_size <= rank) {
        // recurse on the right partition
        LOGM << "Recursing right, " << right_size
             << " elements remaining (rank = " << rank - left_size - middle_size
             << ")\n";

        auto right = data.Filter(
            [&](const ValueType& elem) -> bool {
                return compare(right_pivot, elem);
            }).Collapse();
        assert(right.Size() == right_size);

        return Select(right, rank - left_size - middle_size, compare);
    }
    else {
        // The rank lies strictly inside the middle partition.
        if (!compare(left_pivot, right_pivot)) {
            // The pivots are equivalent, so every element of the middle
            // partition is equivalent to them and the pivot is the result.
            // Recursing here would not shrink the problem and would never
            // terminate when the middle partition is the whole input
            // (e.g. a large input whose elements are all equal).
            LOGM << "result is pivot (pivots are equivalent): " << left_pivot;
            return left_pivot;
        }

        // recurse on the middle partition
        LOGM << "Recursing middle, " << middle_size
             << " elements remaining (rank = " << rank - left_size << ")\n";

        auto middle = data.Filter(
            [&](const ValueType& elem) -> bool {
                return !compare(elem, left_pivot) &&
                !compare(right_pivot, elem);
            }).Collapse();
        assert(middle.Size() == middle_size);

        return Select(middle, rank - left_size, compare);
    }
}