//! Runs k-means on randomly generated points and logs the resulting cost.
//!
//! \param ctx          Thrill context; used to build the point DIA and to
//!                     restrict logging to rank 0.
//! \param dimensions   number of coordinates handed to Point::Random and KMeans.
//! \param num_clusters forwarded to KMeans.
//! \param iterations   forwarded to KMeans.
//! \param svg_path     when non-empty and dimensions == 2, OutputSVG is called
//!                     with this path.
//! \param svg_scale    forwarded to OutputSVG.
//! \param input_paths  must contain exactly one entry that parses as the
//!                     number of points to generate; otherwise die() is called.
static void RunKMeansGenerated(
    thrill::Context& ctx,
    size_t dimensions, size_t num_clusters, size_t iterations,
    const std::string& svg_path, double svg_scale,
    const std::vector<std::string>& input_paths) {

    // coordinates are drawn uniformly from [0, 1000) with a
    // non-deterministically seeded engine
    std::random_device seed_source;
    std::default_random_engine rng(seed_source());
    std::uniform_real_distribution<float> dist(0.0, 1000.0);

    // in "generated" mode the single input path is the point count
    size_t num_points;
    const bool have_point_count =
        input_paths.size() == 1 &&
        thrill::common::from_str<size_t>(input_paths[0], num_points);
    if (!have_point_count) {
        die("For generated data, set input_path to the number of points.");
    }

    // NOTE(review): points is consumed by KMeans, ComputeCost and OutputSVG
    // below without a Cache(); the SGD runner in this file caches its
    // generated data. Confirm that re-evaluating this random generator for
    // each consumer is intended.
    auto points = Generate(
        ctx, [&](const size_t& /* index */) {
            return Point::Random(dimensions, dist, rng);
        }, num_points);

    auto result = KMeans(points, dimensions, num_clusters, iterations);

    const double cost = result.ComputeCost(points);
    if (ctx.my_rank() == 0) {
        LOG1 << "k-means cost: " << cost;
    }

    // an SVG is only written for two-dimensional data
    const bool want_svg = !svg_path.empty() && dimensions == 2;
    if (want_svg) {
        OutputSVG(svg_path, svg_scale, points, result);
    }
}
static void RunStochasticGradGenerated( thrill::Context& ctx, size_t dimensions, size_t iterations, size_t num_points, double mini_batch_fraction, double step_size, double tolerance, const std::string& svg_path, double svg_scale, size_t repetitions) { std::default_random_engine rng(2342); std::uniform_real_distribution<double> uni_dist(-100.0, 100.0); std::normal_distribution<double> norm_dist(1.0, 0.1); std::normal_distribution<double> weight_dist(1.0, 5); Vector weights = Vector::Random(dimensions, weight_dist, rng); if (ctx.my_rank() == 0) LOG1 << "Generated weights: " << weights; auto points = Generate( ctx, num_points, [&](const size_t& /* index */) { auto x = Vector::Random(dimensions, uni_dist, rng); auto y = weights.dot(x) * norm_dist(rng); return DataPoint<Vector>({ x, y }); }) .Cache().KeepForever().Execute(); auto start = std::chrono::high_resolution_clock::now(); Vector result; for (size_t r = 0; r < repetitions; r++) { auto grad_descent = StochasticGradientDescent<Vector>( iterations, mini_batch_fraction, step_size, tolerance); auto initial_weights = Vector::Make(dimensions).fill(1.0); result = grad_descent.optimize(points, initial_weights); } auto end = std::chrono::high_resolution_clock::now(); if (ctx.my_rank() == 0) { LOG1 << "Estimated weights: " << result; LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s"; } if (svg_path.size() && dimensions == 1) { OutputSVG(svg_path, svg_scale, points.Collapse(), result); } }
static void RunKMeansFile( thrill::Context& ctx, size_t dimensions, size_t num_clusters, size_t iterations, const std::string& svg_path, double svg_scale, const std::vector<std::string>& input_paths) { auto points = ReadLines(ctx, input_paths).Map( [dimensions](const std::string& input) { // parse "<pt> <pt> <pt> ..." lines Point p = Point::Make(dimensions); char* endptr = const_cast<char*>(input.c_str()); for (size_t i = 0; i < dimensions; ++i) { while (*endptr == ' ') ++endptr; p.x[i] = std::strtod(endptr, &endptr); if (!endptr || (*endptr != ' ' && i != dimensions - 1)) { die("Could not parse point coordinates: " << input); } } while (*endptr == ' ') ++endptr; if (!endptr || *endptr != 0) { die("Could not parse point coordinates: " << input); } return p; }); auto result = KMeans(points, dimensions, num_clusters, iterations); double cost = result.ComputeCost(points); if (ctx.my_rank() == 0) LOG1 << "k-means cost: " << cost; if (svg_path.size() && dimensions == 2) { OutputSVG(svg_path, svg_scale, points.Collapse(), result); } }
static void RunStochasticGradFile( thrill::Context& ctx, size_t dimensions, size_t iterations, double mini_batch_fraction, double step_size, double tolerance, const std::string& svg_path, double svg_scale, const std::string& input_path, size_t repetitions) { auto points = ReadLines(ctx, input_path) .Filter( [](const std::string& input) { // filter empty lines and comments return (!input.empty() && input.at(0) != '#'); }) .Map( [dimensions](const std::string& input) { // parse "<pt> <pt> <pt> ... <lbl>" lines Vector v = Vector::Make(dimensions); double l; char* endptr = const_cast<char*>(input.c_str()); for (size_t i = 0; i < dimensions; ++i) { while (*endptr == ' ') ++endptr; v.x[i] = std::strtod(endptr, &endptr); if (!endptr || *endptr != ' ') { die("Could not parse point coordinates: " << input); } } while (*endptr == ' ') ++endptr; l = std::strtod(endptr, &endptr); if (!endptr) { die("Could not parse point coordinates: " << input); } while (*endptr == ' ') ++endptr; if (!endptr || *endptr != 0) { die("Could not parse point coordinates: " << input); } return DataPoint<Vector>({ v, l }); }) .Cache().KeepForever().Execute(); auto start = std::chrono::high_resolution_clock::now(); Vector result; for (size_t r = 0; r < repetitions; r++) { auto grad_descent = StochasticGradientDescent<Vector>( iterations, mini_batch_fraction, step_size, tolerance); auto initial_weights = Vector::Make(dimensions).fill(1.0); result = grad_descent.optimize(points, initial_weights); } auto end = std::chrono::high_resolution_clock::now(); if (ctx.my_rank() == 0) { LOG1 << "Estimated weights: " << result; LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s"; } if (svg_path.size() && dimensions == 1) { OutputSVG(svg_path, svg_scale, points.Collapse(), result); } }