Ejemplo n.º 1
0
// Runs k-means on randomly generated points and logs the resulting cost.
//
// input_paths must contain exactly one entry, which is parsed as the number of
// points to generate; otherwise the program dies. Every point is produced by
// Point::Random() with `dimensions` coordinates drawn from a uniform float
// distribution constructed with bounds (0, 1000). If svg_path is non-empty and
// dimensions == 2, the points and the result are passed to OutputSVG().
static void RunKMeansGenerated(
    thrill::Context& ctx,
    size_t dimensions, size_t num_clusters, size_t iterations,
    const std::string& svg_path, double svg_scale,
    const std::vector<std::string>& input_paths) {

    // non-deterministic seed taken from std::random_device
    std::default_random_engine rng(std::random_device { } ());
    std::uniform_real_distribution<float> dist(0.0, 1000.0);

    size_t num_points = 0;
    if (input_paths.size() != 1 ||
        !thrill::common::from_str<size_t>(input_paths[0], num_points))
        die("For generated data, set input_path to the number of points.");

    // Materialize the generated points once and keep them: the generator
    // draws from rng, so evaluating it again would hand a different point set
    // to the cost computation and the SVG output than the one being clustered.
    auto points = Generate(
        ctx, [&](const size_t& /* index */) {
            return Point::Random(dimensions, dist, rng);
        }, num_points)
                  .Cache().KeepForever();

    auto result = KMeans(points, dimensions, num_clusters, iterations);

    double cost = result.ComputeCost(points);
    // only the worker with rank 0 prints the cost
    if (ctx.my_rank() == 0)
        LOG1 << "k-means cost: " << cost;

    // SVG output is only produced for two-dimensional data
    if (svg_path.size() && dimensions == 2) {
        OutputSVG(svg_path, svg_scale, points, result);
    }
}
// Runs stochastic gradient descent on synthetic data and logs the estimated
// weights together with the average wall-clock time per repetition.
//
// A weight vector is drawn from a normal distribution (mean 1, stddev 5).
// Each data point has features drawn uniformly with bounds (-100, 100) and the
// label dot(weights, features) scaled by a normal factor (mean 1, stddev 0.1).
// The optimization is run `repetitions` times from all-ones initial weights;
// the result of the last run is reported. If svg_path is non-empty and
// dimensions == 1, the points and the result are passed to OutputSVG().
static void RunStochasticGradGenerated(
    thrill::Context& ctx, size_t dimensions, size_t iterations,
    size_t num_points, double mini_batch_fraction,
    double step_size, double tolerance,
    const std::string& svg_path, double svg_scale, size_t repetitions) {

    using Clock = std::chrono::high_resolution_clock;
    using Seconds = std::chrono::duration<double>;

    // engine with a fixed seed
    std::default_random_engine engine(2342);
    std::normal_distribution<double> weight_dist(1.0, 5);
    std::uniform_real_distribution<double> feature_dist(-100.0, 100.0);
    std::normal_distribution<double> noise_dist(1.0, 0.1);

    Vector true_weights = Vector::Random(dimensions, weight_dist, engine);
    if (ctx.my_rank() == 0)
        LOG1 << "Generated weights: " << true_weights;

    // generator for one labelled sample
    auto make_sample =
        [&](const size_t& /* index */) {
            auto x = Vector::Random(dimensions, feature_dist, engine);
            auto y = true_weights.dot(x) * noise_dist(engine);
            return DataPoint<Vector>({ x, y });
        };

    auto samples =
        Generate(ctx, num_points, make_sample)
        .Cache().KeepForever().Execute();

    Vector estimate;

    const auto started = Clock::now();

    for (size_t run = 0; run != repetitions; ++run) {
        StochasticGradientDescent<Vector> solver(
            iterations, mini_batch_fraction, step_size, tolerance);

        auto start_weights = Vector::Make(dimensions).fill(1.0);
        estimate = solver.optimize(samples, start_weights);
    }

    const auto finished = Clock::now();

    // only the worker with rank 0 reports
    if (ctx.my_rank() == 0) {
        LOG1 << "Estimated weights: " << estimate;
        LOG1 << "Computation time: "
             << std::chrono::duration_cast<Seconds>(finished - started).count() / repetitions
             << "s";
    }

    // SVG output is only produced for one-dimensional data
    if (!svg_path.empty() && dimensions == 1) {
        OutputSVG(svg_path, svg_scale, samples.Collapse(), estimate);
    }
}
Ejemplo n.º 3
0
// Runs k-means on points read from text files and logs the resulting cost.
//
// Every line of the files in input_paths must consist of exactly `dimensions`
// floating point coordinates separated by spaces (leading and trailing spaces
// are allowed). Any other line makes the program die. If svg_path is non-empty
// and dimensions == 2, the points and the result are passed to OutputSVG().
static void RunKMeansFile(
    thrill::Context& ctx,
    size_t dimensions, size_t num_clusters, size_t iterations,
    const std::string& svg_path, double svg_scale,
    const std::vector<std::string>& input_paths) {

    auto points =
        ReadLines(ctx, input_paths).Map(
            [dimensions](const std::string& input) {
                // parse "<pt> <pt> <pt> ..." lines
                Point p = Point::Make(dimensions);
                const char* cursor = input.c_str();
                for (size_t i = 0; i < dimensions; ++i) {
                    while (*cursor == ' ') ++cursor;
                    char* parsed_end = nullptr;
                    p.x[i] = std::strtod(cursor, &parsed_end);
                    // strtod() signals "nothing converted" by leaving the end
                    // pointer at the start; without this check a missing last
                    // coordinate would silently be read as 0.
                    if (parsed_end == cursor ||
                        (*parsed_end != ' ' && i != dimensions - 1)) {
                        die("Could not parse point coordinates: " << input);
                    }
                    cursor = parsed_end;
                }
                // only trailing spaces may follow the last coordinate
                while (*cursor == ' ') ++cursor;
                if (*cursor != 0) {
                    die("Could not parse point coordinates: " << input);
                }
                return p;
            });

    auto result = KMeans(points, dimensions, num_clusters, iterations);

    double cost = result.ComputeCost(points);
    // only the worker with rank 0 prints the cost
    if (ctx.my_rank() == 0)
        LOG1 << "k-means cost: " << cost;

    // SVG output is only produced for two-dimensional data
    if (svg_path.size() && dimensions == 2) {
        OutputSVG(svg_path, svg_scale, points.Collapse(), result);
    }
}
// Runs stochastic gradient descent on labelled points read from a text file
// and logs the estimated weights together with the average wall-clock time
// per repetition.
//
// Empty lines and lines starting with '#' are skipped. Every other line must
// consist of `dimensions` floating point features followed by one floating
// point label, all separated by spaces; any other line makes the program die.
// The optimization is run `repetitions` times from all-ones initial weights;
// the result of the last run is reported. If svg_path is non-empty and
// dimensions == 1, the points and the result are passed to OutputSVG().
static void RunStochasticGradFile(
    thrill::Context& ctx, size_t dimensions, size_t iterations,
    double mini_batch_fraction, double step_size, double tolerance,
    const std::string& svg_path, double svg_scale,
    const std::string& input_path, size_t repetitions) {

    auto points =
        ReadLines(ctx, input_path)
        .Filter(
            [](const std::string& input) {
                // filter empty lines and comments
                return (!input.empty() && input.at(0) != '#');
            })
        .Map(
            [dimensions](const std::string& input) {
                // parse "<pt> <pt> <pt> ... <lbl>" lines
                Vector v = Vector::Make(dimensions);
                const char* cursor = input.c_str();
                char* parsed_end = nullptr;
                for (size_t i = 0; i < dimensions; ++i) {
                    while (*cursor == ' ') ++cursor;
                    v.x[i] = std::strtod(cursor, &parsed_end);
                    // every feature must be a number followed by a space,
                    // since the label still has to come after it
                    if (parsed_end == cursor || *parsed_end != ' ') {
                        die("Could not parse point coordinates: " << input);
                    }
                    cursor = parsed_end;
                }
                while (*cursor == ' ') ++cursor;
                const double l = std::strtod(cursor, &parsed_end);
                // strtod() signals "nothing converted" by leaving the end
                // pointer at the start; without this check a missing label
                // would silently be read as 0.
                if (parsed_end == cursor) {
                    die("Could not parse point coordinates: " << input);
                }
                cursor = parsed_end;
                // only trailing spaces may follow the label
                while (*cursor == ' ') ++cursor;
                if (*cursor != 0) {
                    die("Could not parse point coordinates: " << input);
                }
                return DataPoint<Vector>({ v, l });
            })
        .Cache().KeepForever().Execute();

    auto start = std::chrono::high_resolution_clock::now();

    Vector result;

    for (size_t r = 0; r < repetitions; r++) {
        auto grad_descent = StochasticGradientDescent<Vector>(
            iterations, mini_batch_fraction, step_size, tolerance);

        auto initial_weights = Vector::Make(dimensions).fill(1.0);
        result = grad_descent.optimize(points, initial_weights);
    }

    auto end = std::chrono::high_resolution_clock::now();

    // only the worker with rank 0 reports
    if (ctx.my_rank() == 0) {
        LOG1 << "Estimated weights: " << result;
        LOG1 << "Computation time: " << (std::chrono::duration_cast<std::chrono::duration<double> >(end - start)).count() / repetitions << "s";
    }

    // SVG output is only produced for one-dimensional data
    if (svg_path.size() && dimensions == 1) {
        OutputSVG(svg_path, svg_scale, points.Collapse(), result);
    }
}