TEST(SumNode, GenerateAndSumHaveEqualAmount2) {
    // Driver run on every worker Context by RunLocalTests.
    std::function<void(Context&)> start_func =
        [](Context& ctx) {
            // TODO(ms): Replace this with some test-specific rendered file
            auto parsed = ReadLines(ctx, "inputs/test1")
                          .Map([](const std::string& line) {
                                   return std::stoi(line);
                               });

            // identity Map keeps the lambda stack non-trivial before Collapse
            auto identity = parsed.Map([](int x) { return x; });

            auto plus = [](int a, int b) { return a + b; };

            DIA<int> collapsed = identity.Collapse();

            // the rendered input file must contain 16 integers summing to 136
            ASSERT_EQ(136, collapsed.Sum(plus));
            ASSERT_EQ(16u, collapsed.Size());
        };

    api::RunLocalTests(start_func);
}
TEST(Stage, CountReferencesLOpNode) {
    // Verifies the DIANode reference counting: each DIA handle and each
    // registered child node contributes one reference to the underlying node.
    std::function<void(Context&)> start_func =
        [](Context& ctx) {
            auto integers = Generate(
                ctx,
                [](const size_t& index) { return static_cast<int>(index) + 1; },
                16);

            auto duplicate_elements = [](int in, auto emit) {
                                          emit(in);
                                          emit(in);
                                      };
            auto modulo_two = [](int in) { return (in % 2); };
            auto add_function = [](int in1, int in2) { return in1 + in2; };

            // Create a new DIA reference to Generate
            auto doubles = integers.FlatMap(duplicate_elements);

            // Create a child reference to Generate
            // Create a new DIA reference to LOpNode
            DIA<int> quadruples = integers.FlatMap(duplicate_elements).Cache();

            // Create a new child reference to LOpNode
            auto reduced = quadruples.ReduceBy(modulo_two, add_function);

            // Trigger execution
            std::vector<int> out_vec = reduced.AllGather();

            // 2x DIA reference + 1x child reference
            ASSERT_EQ(integers.node_refcount(), 3u);
            // doubles shares Generate's node (FlatMap chains on the stack)
            ASSERT_EQ(doubles.node_refcount(), 3u);
            // 1x DIA reference + 1x child reference
            ASSERT_EQ(quadruples.node_refcount(), 2u);
            // 1x DIA reference + 0x child reference
            ASSERT_EQ(reduced.node_refcount(), 1u);
        };

    api::RunLocalTests(start_func);
}
std::pair<ValueType, ValueType> PickPivots(const DIA<ValueType, InStack>& data, size_t size, size_t rank, const Compare& compare = Compare()) { api::Context& ctx = data.context(); const size_t num_workers(ctx.num_workers()); const double size_d = static_cast<double>(size); const double p = 20 * sqrt(static_cast<double>(num_workers)) / size_d; // materialized at worker 0 auto sample = data.BernoulliSample(p).Gather(); std::pair<ValueType, ValueType> pivots; if (ctx.my_rank() == 0) { LOG << "got " << sample.size() << " samples (p = " << p << ")"; // Sort the samples std::sort(sample.begin(), sample.end(), compare); const double base_pos = static_cast<double>(rank * sample.size()) / size_d; const double offset = pow(size_d, 0.25 + delta); long lower_pos = static_cast<long>(floor(base_pos - offset)); long upper_pos = static_cast<long>(floor(base_pos + offset)); size_t lower = static_cast<size_t>(std::max(0L, lower_pos)); size_t upper = static_cast<size_t>( std::min(upper_pos, static_cast<long>(sample.size() - 1))); assert(0 <= lower && lower < sample.size()); assert(0 <= upper && upper < sample.size()); LOG << "Selected pivots at positions " << lower << " and " << upper << ": " << sample[lower] << " and " << sample[upper]; pivots = std::make_pair(sample[lower], sample[upper]); } pivots = ctx.net.Broadcast(pivots); LOGM << "pivots: " << pivots.first << " and " << pivots.second; return pivots; }
static DIA<ValueType> MakeCollapse(const DIA<ValueType, Stack>& dia) {
    // Wrap `dia` in a CollapseNode: the rhs lambda stack is folded into the
    // node, and the returned DIA carries an empty stack.
    assert(dia.IsValid());

    using Node = api::CollapseNode<ValueType>;
    auto node = tlx::make_counting<Node>(dia);
    return DIA<ValueType>(node);
}
TEST(Graph, WhileLoop) {
    // Builds a DIA graph inside a host-side while loop and checks that
    // re-assigning through Collapse/Cache keeps the graph consistent.
    std::function<void(Context&)> start_func =
        [](Context& ctx) {
            auto integers = Generate(
                ctx,
                [](const size_t& index) -> size_t { return index; },
                16);

            auto flatmap_duplicate = [](size_t in, auto emit) {
                                         emit(in);
                                         emit(in);
                                     };
            auto map_multiply = [](size_t in) { return 2 * in; };

            DIA<size_t> squares = integers.Collapse();
            // `sum` actually holds the item COUNT (Size()), not a sum of values
            size_t sum = 0;

            // each iteration doubles the item count: 16 -> 32 -> 64, so the
            // loop body runs twice before sum < 64 fails
            while (sum < 64) {
                auto pairs = squares.FlatMap(flatmap_duplicate);
                auto multiplied = pairs.Map(map_multiply);
                squares = multiplied.Cache();
                sum = squares.Size();
            }

            std::vector<size_t> out_vec = squares.AllGather();

            ASSERT_EQ(64u, out_vec.size());
            ASSERT_EQ(64u, squares.Size());

            ctx.stats_graph().BuildLayout("loop.out");
        };

    api::RunLocalTests(start_func);
}
DIA<ValueType, Stack>::DIA(const DIA<ValueType, AnyStack>& rhs) {
    // Conversion constructor: assigning/casting a DIA with a different lambda
    // stack to this DIA type folds the stack into a new CollapseNode.
    //
    // Create new CollapseNode. Transfer stack from rhs to CollapseNode. Build
    // new DIA with empty stack and CollapseNode
    using CollapseNode = api::CollapseNode<ValueType, DIA<ValueType, AnyStack> >;

    // warn (at debug level) because the implicit Collapse costs a pipeline
    // break compared to inline chaining via `auto`
    LOG0 << "WARNING: cast to DIA creates CollapseNode instead of inline chaining.";
    LOG0 << "Consider whether you can use auto instead of DIA.";

    StatsNode* stats_node = rhs.AddChildStatsNode("Collapse", DIANodeType::COLLAPSE);
    node_ = std::make_shared<CollapseNode>(rhs, stats_node);
    // stack_ is default constructed.
    stats_parents_.emplace_back(stats_node);
}
void OutputSVG(const std::string& svg_path, double svg_scale,
               const DIA<Point<2> >& point_dia,
               const KMeansModel<Point<2> >& model) {
    // Renders the classified 2-d point set (one color per cluster) and the
    // final centroids (larger, black-outlined circles) as an SVG file.
    // Only worker 0 writes the file.
    double width = 0, height = 0;

    using Point2D = Point<2>;

    const std::vector<Point2D>& centroids = model.centroids();
    std::vector<PointClusterId<Point2D> > list =
        model.ClassifyPairs(point_dia).Gather();

    // canvas size = bounding box of all points (coordinates assumed >= 0)
    for (const PointClusterId<Point2D>& p : list) {
        width = std::max(width, p.first.x[0]);
        height = std::max(height, p.first.x[1]);
    }

    // Gather() materializes at worker 0; everyone else is done here
    if (point_dia.context().my_rank() != 0) return;

    std::ofstream os(svg_path);

    os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    os << "<svg\n";
    os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
    os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
    os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
    os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
    os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
    os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
       << "\" height=\"" << height * svg_scale << "\">\n";
    os << " <g id=\"layer1\">\n";

    // one small circle per data point, colored by its cluster id
    for (const PointClusterId<Point2D>& p : list) {
        os << " <circle r=\"1\" cx=\"" << p.first.x[0] * svg_scale
           << "\" cy=\"" << p.first.x[1] * svg_scale
           << "\" style=\"stroke:none;stroke-opacity:1;fill:"
           << SVGColor(p.second) << ";fill-opacity:1\" />\n";
    }

    // one larger circle per centroid, colored by cluster index
    for (size_t i = 0; i < centroids.size(); ++i) {
        const Point2D& p = centroids[i];
        os << " <circle r=\"4\" cx=\"" << p.x[0] * svg_scale
           << "\" cy=\"" << p.x[1] * svg_scale
           << "\" style=\"stroke:black;stroke-opacity:1;fill:"
           << SVGColor(i) << ";fill-opacity:1\" />\n";
    }

    os << " </g>\n";
    os << "</svg>\n";
}
TEST(ZipNode, TwoDisbalancedStringArrays) {
    // first DIA is heavily balanced to the first workers, second DIA is
    // balanced to the last workers.
    std::function<void(Context&)> start_func =
        [](Context& ctx) {
            // generate random strings with 10..20 characters
            auto input_gen = Generate(
                ctx,
                [](size_t index) -> std::string {
                    // deterministic per-index RNG so every worker agrees
                    std::default_random_engine rng(
                        123456 + static_cast<unsigned>(index));
                    std::uniform_int_distribution<size_t> length(10, 20);
                    rng();       // skip one number
                    return common::RandomString(
                        length(rng), rng, "abcdefghijklmnopqrstuvwxyz")
                           + std::to_string(index);
                },
                test_size);

            DIA<std::string> input = input_gen.Cache();

            std::vector<std::string> vinput = input.AllGather();
            ASSERT_EQ(test_size, vinput.size());

            // Filter out strings that start with a-e
            auto input1 = input.Filter(
                [](const std::string& str) { return str[0] <= 'e'; });
            // Filter out strings that start with w-z
            auto input2 = input.Filter(
                [](const std::string& str) { return str[0] >= 'w'; });

            // zip the two disbalanced DIAs by concatenation
            auto zip_result = input1.Zip(
                input2, [](const std::string& a, const std::string& b) {
                    return a + b;
                });

            // check result
            std::vector<std::string> res = zip_result.AllGather();

            // recalculate result locally
            std::vector<std::string> check;
            {
                std::vector<std::string> v1, v2;

                for (size_t index = 0; index < vinput.size(); ++index) {
                    const std::string& s1 = vinput[index];
                    if (s1[0] <= 'e') v1.push_back(s1);
                    if (s1[0] >= 'w') v2.push_back(s1);
                }

                // filters must agree with the local recomputation
                ASSERT_EQ(v1, input1.AllGather());
                ASSERT_EQ(v2, input2.AllGather());

                // Zip truncates to the shorter input
                for (size_t i = 0; i < std::min(v1.size(), v2.size()); ++i) {
                    check.push_back(v1[i] + v2[i]);
                    // sLOG1 << check.back();
                }
            }

            for (size_t i = 0; i != res.size(); ++i) {
                sLOG0 << res[i] << " " << check[i] << (res[i] == check[i]);
            }

            ASSERT_EQ(check.size(), res.size());
            ASSERT_EQ(check, res);
        };

    api::RunLocalTests(start_func);
}
auto PageRank(const DIA<std::string, InStack>& in, api::Context& ctx, int iter) {
    // Computes PageRank over the edge list `in` ("src dst" per line, 1-based
    // page ids) for `iter` iterations and returns a DIA of "node_id: rank"
    // strings. `f` and `s` are presumably the damping constants (0.15 /
    // 0.85) defined elsewhere in this file -- TODO confirm.

    // parse edges and convert page ids to 0-based
    DIA<Page_Link> input = in.Map(
        [](const std::string& input) {
            auto split = thrill::common::Split(input, " ");

            LOG0 << "input " << (std::stoi(split[0]) - 1)
                 << " " << (std::stoi(split[1]) - 1);

            // set base of page_id to 0
            return Page_Link((size_t)(std::stoi(split[0]) - 1),
                             (size_t)(std::stoi(split[1]) - 1));
        }).Cache();
    // TODO(SL): when Cache() is removed, code doesn't compile,
    // auto cannot be used either

    // aggregate all outgoing links of a page in this format:
    //
    // URL   OUTGOING
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ([linked_url, linked_url, ...])
    // ...

    // get number of nodes by finding max page_id
    // add 1 to max node_id to get number of nodes because of node_id 0
    //
    // NOTE(review): this combiner is not associative in .second (it returns
    // an arbitrary partial max there); only .first -- the global maximum --
    // is consumed below. Confirm Sum() tolerates a non-associative combiner.
    const auto number_nodes = input.Sum(
        [](const Page_Link& in1, const Page_Link& in2) {
            Node first = std::max(in1.first, in2.first);
            Node second = std::max(in1.second, in2.second);
            return std::make_pair(std::max(first, second), first);
        }).first + 1;

    LOG << "number_nodes " << number_nodes;

    // group outgoing links by source page into one vector per page
    auto links = input.GroupByIndex<Outgoings>(
        [](Page_Link p) { return p.first; },
        [](auto& r, Key) {
            std::vector<Node> all;
            while (r.HasNext()) {
                all.push_back(r.Next().second);
            }

            // std::string s = "{";
            // for (auto e : all) {
            //     s += std::to_string(e) + ", ";
            // }
            // LOG << "links " << s << "}";

            return all;
        }, number_nodes).Cache();

    // initialize all ranks to 1.0
    //
    // (url, rank)
    // (url, rank)
    // (url, rank)
    // ...

    // auto ranks = Generate(ctx, [](const size_t& index) {
    //     return std::make_pair(index, 1.0);
    // }, number_nodes).Cache();
    auto ranks = Generate(ctx,
                          [](const size_t&) { return (Rank)1.0; },
                          number_nodes).Cache();

    // node ids for the final output are 1-based again
    auto node_ids = Generate(ctx,
                             [](const size_t& index) { return index + 1; },
                             number_nodes);

    // do iterations
    for (int i = 0; i < iter; ++i) {
        LOG << "iteration " << i;

        // for all outgoing link, get their rank contribution from all
        // links by doing:
        //
        // 1) group all outgoing links with rank of its parent page: (Zip)
        //
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        // ([linked_url, linked_url, ...], rank_parent)
        //
        // 2) compute rank contribution for each linked_url: (FlatMap)
        //
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // (linked_url, rank / OUTGOING.size)
        // ...

        std::cout << links.Size() << std::endl;
        std::cout << ranks.Size() << std::endl;
        // NOTE(review): Size() inside assert() vanishes under NDEBUG and
        // each call triggers a DIA execution -- consider hoisting.
        assert(links.Size() == ranks.Size());

        // TODO(SL): when Zip/FlatMap chained, code doesn't compile, please check
        DIA<Outgoings_Rank> outs_rank = links.Zip(
            ranks, [](const Outgoings& l, const Rank r) {
                // std::string s = "{";
                // for (auto e : l) {
                //     s += std::to_string(e) + ", ";
                // }
                // s += "}";
                // LOG << "contribs1 " << s << " " << r;

                return std::make_pair(l, r);
            });

        // spread each page's rank equally over its outgoing links;
        // dangling pages (no outgoing links) contribute nothing
        auto contribs = outs_rank.FlatMap<Page_Rank>(
            [](const Outgoings_Rank& p, auto emit) {
                if (p.first.size() > 0) {
                    Rank rank_contrib = p.second / p.first.size();
                    // assert (rank_contrib <= 1);
                    for (auto e : p.first) {
                        LOG << "contribs2 " << e << " " << rank_contrib;
                        emit(std::make_pair(e, rank_contrib));
                    }
                }
            });

        // reduce all rank contributions by adding all rank contributions
        // and compute the new rank with 0.15 * 0.85 * sum_rank_contribs
        //
        // (url, rank)
        // (url, rank)
        // (url, rank)
        // ...

        // auto sum_rank_contrib_fn = [](const Page_Rank& p1, const Page_Rank& p2) {
        //     assert(p1.first == p2.first);
        //     return p1.second + p2.second;
        // };

        ranks = contribs.ReduceToIndex(
            [](const Page_Rank& p) { return p.first; },
            [](const Page_Rank& p1, const Page_Rank& p2) {
                return std::make_pair(p1.first, p1.second + p2.second);
            }, number_nodes)
                .Map(
                    [](const Page_Rank p) {
                        LOG << "ranks2 in " << p.first << "-" << p.second;
                        // pages with (near-)zero contribution keep rank 0
                        if (std::fabs(p.second) <= 1E-5) {
                            LOG << "ranks2 " << 0.0;
                            return (Rank)0.0;
                        }
                        else {
                            LOG << "ranks2 " << f + s * p.second;
                            return f + s * p.second;
                        }
                    }).Keep().Collapse();
    }

    // write result to line. add 1 to node_ids to revert back to normal
    auto res = ranks.Zip(node_ids, [](const Rank r, const Node n) {
        return std::to_string(n) + ": " + std::to_string(r);
    });

    // NOTE(review): Size() calls inside assert() -- see note above
    assert(res.Size() == links.Size());

    return res;
}
void OutputSVG(const std::string& svg_path, double svg_scale,
               const DIA<DataPoint<Vector> >& point_dia,
               const Vector& model) {
    // Renders the 1-dimensional regression data set and the fitted line
    // y = model.x[0] * x as an SVG file at svg_path.
    //
    // NOTE(review): unlike the k-means OutputSVG there is no
    // my_rank() == 0 guard before writing; presumably Gather() leaves the
    // list empty on non-zero ranks, which would still write a degenerate
    // file there -- confirm intended.
    double width = 0, height = 0,
        min_vert = 0, max_vert = 0, min_hor = 0, max_hor = 0;

    std::vector<DataPoint<Vector> > list = point_dia.Gather();

    // bounding box of all data points (x on axis 0, label on the y axis)
    for (const DataPoint<Vector>& p : list) {
        min_hor = std::min(min_hor, p.data.x[0]);
        max_hor = std::max(max_hor, p.data.x[0]);
        min_vert = std::min(min_vert, p.label);
        max_vert = std::max(max_vert, p.label);
    }

    // widen the vertical range so the regression line endpoints fit too
    double weight = model.x[0];
    double y1 = min_hor * weight;
    double y2 = max_hor * weight;

    min_vert = std::min(min_vert, y1);
    min_vert = std::min(min_vert, y2);
    max_vert = std::max(max_vert, y1);
    max_vert = std::max(max_vert, y2);

    width = max_hor - min_hor;
    height = max_vert - min_vert;

    std::ofstream os(svg_path);

    os << "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n";
    os << "<svg\n";
    os << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n";
    os << " xmlns:cc=\"http://creativecommons.org/ns#\"\n";
    os << " xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
    os << " xmlns:svg=\"http://www.w3.org/2000/svg\"\n";
    os << " xmlns=\"http://www.w3.org/2000/svg\"\n";
    os << " version=\"1.1\" id=\"svg2\" width=\"" << width * svg_scale
       << "\" height=\"" << height * svg_scale << "\">\n";
    os << " <g id=\"layer1\">\n";

    // Draw grid (x axis and y axis, shifted into SVG's top-left origin)
    os << " <line x1=\"0\" y1=\"" << (height + min_vert) * svg_scale
       << "\" x2=\"" << width * svg_scale
       << "\" y2=\"" << (height + min_vert) * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";
    os << " <line x1=\"" << -min_hor * svg_scale << "\" y1=\"0\""
       << " x2=\"" << -min_hor * svg_scale
       << "\" y2=\"" << height * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#777777\" style=\"stroke-opacity:0.3\" />\n";

    // Draw points
    for (const DataPoint<Vector>& p : list) {
        os << " <circle r=\"1\" cx=\"" << (p.data.x[0] - min_hor) * svg_scale
           << "\" cy=\"" << (height - p.label + min_vert) * svg_scale
           << "\" style=\"stroke:none;stroke-opacity:1;fill:#45a2d1;fill-opacity:1\" />\n";
    }

    // Draw line
    os << " <line x1=\"0\" y1=\"" << (height - y1 + min_vert) * svg_scale
       << "\" x2=\"" << width * svg_scale
       << "\" y2=\"" << (height - y2 + min_vert) * svg_scale
       << "\" stroke-width=\"1\" stroke=\"#ff9900\" />\n";

    os << " </g>\n";
    os << "</svg>\n";
}
ValueType Select(const DIA<ValueType, InStack>& data, size_t rank, const Compare& compare = Compare()) { api::Context& ctx = data.context(); const size_t size = data.Size(); assert(0 <= rank && rank < size); if (size <= base_case_size) { // base case, gather all data at worker with rank 0 ValueType result = ValueType(); auto elements = data.Gather(); if (ctx.my_rank() == 0) { assert(rank < elements.size()); std::nth_element(elements.begin(), elements.begin() + rank, elements.end(), compare); result = elements[rank]; LOG << "base case: " << size << " elements remaining, result is " << result; } result = ctx.net.Broadcast(result); return result; } ValueType left_pivot, right_pivot; std::tie(left_pivot, right_pivot) = PickPivots(data, size, rank, compare); size_t left_size, middle_size, right_size; using PartSizes = std::pair<size_t, size_t>; std::tie(left_size, middle_size) = data.Map( [&](const ValueType& elem) -> PartSizes { if (compare(elem, left_pivot)) return PartSizes { 1, 0 }; else if (!compare(right_pivot, elem)) return PartSizes { 0, 1 }; else return PartSizes { 0, 0 }; }) .Sum( [](const PartSizes& a, const PartSizes& b) -> PartSizes { return PartSizes { a.first + b.first, a.second + b.second }; }, PartSizes { 0, 0 }); right_size = size - left_size - middle_size; LOGM << "left_size = " << left_size << ", middle_size = " << middle_size << ", right_size = " << right_size << ", rank = " << rank; if (rank == left_size) { // all the elements strictly smaller than the left pivot are on the left // side -> left_size-th element is the left pivot LOGM << "result is left pivot: " << left_pivot; return left_pivot; } else if (rank == left_size + middle_size - 1) { // only the elements strictly greater than the right pivot are on the // right side, so the result is the right pivot in this case LOGM << "result is right pivot: " << right_pivot; return right_pivot; } else if (rank < left_size) { // recurse on the left partition LOGM << "Recursing left, " << left_size << " 
elements remaining (rank = " << rank << ")\n"; auto left = data.Filter( [&](const ValueType& elem) -> bool { return compare(elem, left_pivot); }).Collapse(); assert(left.Size() == left_size); return Select(left, rank, compare); } else if (left_size + middle_size <= rank) { // recurse on the right partition LOGM << "Recursing right, " << right_size << " elements remaining (rank = " << rank - left_size - middle_size << ")\n"; auto right = data.Filter( [&](const ValueType& elem) -> bool { return compare(right_pivot, elem); }).Collapse(); assert(right.Size() == right_size); return Select(right, rank - left_size - middle_size, compare); } else { // recurse on the middle partition LOGM << "Recursing middle, " << middle_size << " elements remaining (rank = " << rank - left_size << ")\n"; auto middle = data.Filter( [&](const ValueType& elem) -> bool { return !compare(elem, left_pivot) && !compare(right_pivot, elem); }).Collapse(); assert(middle.Size() == middle_size); return Select(middle, rank - left_size, compare); } }