Code Example #1
File: camera_pipe.cpp  Project: alinas/Halide
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp,
             Param<float> gamma, Param<float> contrast, Param<int> blackLevel, Param<int> whiteLevel) {

    Var yii, xi;

    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast, blackLevel, whiteLevel);

    processed(x, y, c) = curved(x, y, c);

    // Schedule
    Expr out_width = processed.output_buffer().width();
    Expr out_height = processed.output_buffer().height();

    int strip_size = 32;
    int vec = target.natural_vector_size(UInt(16));
    if (target.has_feature(Target::HVX_64)) {
        vec = 32;
    } else if (target.has_feature(Target::HVX_128)) {
        vec = 64;
    }
    denoised.compute_at(processed, yi).store_at(processed, yo)
        .fold_storage(y, 8)
        .vectorize(x, vec);
    deinterleaved.compute_at(processed, yi).store_at(processed, yo)
        .fold_storage(y, 4)
        .vectorize(x, 2*vec, TailStrategy::RoundUp)
        .reorder(c, x, y)
        .unroll(c);
    corrected.compute_at(processed, x)
        .vectorize(x, vec)
        .reorder(c, x, y)
        .unroll(c);
    processed.compute_root()
        .split(y, yo, yi, strip_size)
        .split(yi, yi, yii, 2)
        .split(x, x, xi, 2*vec, TailStrategy::RoundUp)
        .reorder(xi, c, yii, x, yi, yo)
        .vectorize(xi, 2*vec)
        .parallel(yo);

    if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        processed.hexagon();
        denoised.align_storage(x, vec);
        deinterleaved.align_storage(x, vec);
        corrected.align_storage(x, vec);
    }

    // We can generate slightly better code if we know the splits divide the extent.
    processed
        .bound(c, 0, 3)
        .bound(x, 0, ((out_width)/(2*vec))*(2*vec))
        .bound(y, 0, (out_height/strip_size)*strip_size);

    return processed;
}
Code Example #2
File: extern_producer.cpp  Project: AheadIO/Halide
int main(int argc, char **argv) {
    Func source;
    source.define_extern("make_data",
                         std::vector<ExternFuncArgument>(),
                         Float(32), 2);
    Func sink;
    Var x, y;
    sink(x, y) = source(x, y) - sin(x + y);

    Var xi, yi;
    sink.tile(x, y, xi, yi, 32, 32);

    // Compute the source per tile of sink
    source.compute_at(sink, x);

    Image<float> output = sink.realize(100, 100);

    // Should be all zeroes.
    RDom r(output);
    float error = evaluate_may_gpu<float>(sum(abs(output(r.x, r.y))));
    if (error != 0) {
        printf("Something went wrong\n");
        return -1;
    }

    Func multi;
    std::vector<Type> types;
    types.push_back(Float(32));
    types.push_back(Float(32));
    multi.define_extern("make_data_multi",
                        std::vector<ExternFuncArgument>(),
                        types, 2);
    Func sink_multi;
    sink_multi(x, y) = multi(x, y)[0] - sin(x + y) +
                       multi(x, y)[1] - cos(x + y);

    sink_multi.tile(x, y, xi, yi, 32, 32);

    // Compute the source per tile of sink
    multi.compute_at(sink_multi, x);

    Image<float> output_multi = sink_multi.realize(100, 100);

    // Should be all zeroes.
    float error_multi = evaluate<float>(sum(abs(output_multi(r.x, r.y))));
    if (error_multi != 0) {
        printf("Something went wrong in multi case\n");
        return -1;
    }

    printf("Success!\n");
    return 0;

}
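
The extern producer itself is not included in this example. As a rough sketch of what make_data might look like under the legacy buffer_t extern interface these tests use (the bounds-query convention and field usage below are assumptions, not the actual test source), it fills the requested region with sin(x + y) so that sink(x, y) evaluates to zero everywhere:

// Hypothetical sketch of the extern producer referenced above.
// Requires <math.h> and the Halide runtime's buffer_t definition.
extern "C" int make_data(buffer_t *out) {
    if (out->host == nullptr) {
        // Bounds-query call: the requested min/extent are acceptable as-is.
        return 0;
    }
    for (int y = 0; y < out->extent[1]; y++) {
        for (int x = 0; x < out->extent[0]; x++) {
            float *dst = (float *)out->host + y * out->stride[1] + x * out->stride[0];
            *dst = sinf((x + out->min[0]) + (y + out->min[1]));
        }
    }
    return 0;
}
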
Code Example #3
File: diffuse.cpp  Project: CarVac/filmulator-gui
Func blur_then_transpose(Func f, Func coeff, Expr size, Expr sigma) {

    Func blurred = performBlur(f, coeff, size, sigma);

    // Also compute attenuation due to zero boundary condition by
    // blurring an image of ones in the same way. This gives a
    // boundary condition equivalent to reweighting the Gaussian
    // near the edge. (TODO: add a generator param to select
    // different boundary conditions).
    Func ones;
    ones(x, y) = 1.0f;
    Func attenuation = performBlur(ones, coeff, size, sigma);

    // Invert the attenuation so we can multiply by it. The
    // attenuation is the same for every row/channel so we only
    // need one column.
    Func inverse_attenuation;
    inverse_attenuation(y) = 1.0f / attenuation(0, y);

    // Transpose it
    Func transposed;
    transposed(x, y) = blurred(y, x);

    // Correct for attenuation
    Func out;
    out(x, y) = transposed(x, y) * inverse_attenuation(x);

    // Schedule it.
    Var yi, xi, yii, xii;

    attenuation.compute_root();
    inverse_attenuation.compute_root().vectorize(y, 8);
    out.compute_root()
        .tile(x, y, xi, yi, 8, 32)
        .tile(xi, yi, xii, yii, 8, 8)
        .vectorize(xii).unroll(yii).parallel(y);
    blurred.compute_at(out, y);
    transposed.compute_at(out, xi).vectorize(y).unroll(x);

    for (int i = 0; i < blurred.num_update_definitions(); i++) {
        RDom r = blurred.reduction_domain(i);
        if (r.defined()) {
            blurred.update(i).reorder(x, r);
        }
        blurred.update(i).vectorize(x, 8).unroll(x);
    }

    return out;
}
Code Example #4
int main(int argc, char **argv) {
    Func mandelbrot;
    Var x, y;

    Param<float> x_min, x_max, y_min, y_max, c_real, c_imag;
    Param<int> w, h, iters;
    Complex initial(lerp(x_min, x_max, cast<float>(x)/w),
                    lerp(y_min, y_max, cast<float>(y)/h));
    Complex c(c_real, c_imag);

    Var z;
    mandelbrot(x, y, z) = initial;
    RDom t(1, iters);
    Complex current = mandelbrot(x, y, t-1);
    mandelbrot(x, y, t) = current*current + c;

    // How many iterations until something escapes a circle of radius 2?
    Func count;
    Tuple escape = argmin(magnitude(mandelbrot(x, y, t)) < 4);

    // If it never escapes, use the value 0
    count(x, y) = select(escape[1], 0, escape[0]);

    Var xi, yi, xo, yo;
    count.tile(x, y, xo, yo, xi, yi, 8, 8);
    count.parallel(yo).vectorize(xi, 4).unroll(xi).unroll(yi, 2);
    mandelbrot.compute_at(count, xo);

    Argument args[] = {x_min, x_max, y_min, y_max, c_real, c_imag, iters, w, h};

    count.compile_to_file("mandelbrot", std::vector<Argument>(args, args + 9));

    return 0;
}
Code Example #5
int main(int argc, char **argv)
{
  Image<uint8_t> input = load<uint8_t>("P1070046.png");

  timeval t1, t2;
  gettimeofday(&t1, NULL);
  Var x,y,c;
  Func toFloat;
  toFloat(c,x,y) = cast<float>(input(x,y,c))/255.0;
  Func toHSV;
  toHSV = hsv(toFloat);
  Func saturated;
  saturated(c,x,y) = select(c != 1,
                            toHSV(c,x,y),
                            clamp(1*fast_pow(toHSV(c,x,y),0.5),
                                  0,1));
  Func toRGB,toInt;
  toRGB = rgb(saturated);
  toInt(x,y,c) = cast<uint8_t>(toRGB(c,x,y)*255.0);
  Var y_outer,y_inner;
  toInt.reorder(c,x,y);
  toInt.split(y,y_outer, y_inner, 256);
  toInt.parallel(y_outer);
  toHSV.compute_at(toInt,x);
  Halide::Image<uint8_t> output = toInt.realize(input.width(),input.height(),input.channels());
  gettimeofday(&t2, NULL);
  save(output,"vibSat.png");
  std::cout<<float(t2.tv_sec - t1.tv_sec) + float(t2.tv_usec - t1.tv_usec)/1000000.0f << std::endl;
  return 0;
}
Code Example #6
    // Now we define methods that give our pipeline several different
    // schedules.
    void schedule_for_cpu() {
        // Compute the look-up-table ahead of time.
        lut.compute_root();

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Look-up-tables don't vectorize well, so just parallelize
        // curved in slices of 16 scanlines.
        Var yo, yi;
        curved.split(y, yo, yi, 16)
              .parallel(yo);

        // Compute sharpen as needed per scanline of curved.
        sharpen.compute_at(curved, yi);

        // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide.
        sharpen.vectorize(x, 8);

        // Compute the padded input as needed per scanline of curved,
        // reusing previous values computed within the same strip of
        // 16 scanlines.
        padded.store_at(curved, yo)
              .compute_at(curved, yi);

        // Also vectorize the padding. It's 8-bit, so we'll vectorize
        // 16-wide.
        padded.vectorize(x, 16);

        // JIT-compile the pipeline for the CPU.
        curved.compile_jit();
    }
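
The Funcs scheduled here (lut, padded, sharpen, curved) are defined elsewhere in the surrounding class. The sketch below reconstructs a plausible shape for them, consistent with the comments above (an 8-bit padded input, a 16-bit sharpen stage, and a look-up table applied last); it is an approximation based on the Halide tutorials, not the exact source, and input is assumed to be an 8-bit Image or ImageParam:

// Hedged sketch of the algorithm that schedule_for_cpu() applies to.
Var x, y, c, i;
Func lut, padded, padded16, sharpen, curved;

// A gamma-like look-up table over 8-bit values.
lut(i) = cast<uint8_t>(clamp(pow(i / 255.0f, 1.2f) * 255.0f, 0.0f, 255.0f));

// Clamp the input at the edges, then widen to 16 bits for filtering.
padded(x, y, c) = input(clamp(x, 0, input.width() - 1),
                        clamp(y, 0, input.height() - 1), c);
padded16(x, y, c) = cast<uint16_t>(padded(x, y, c));

// A small sharpening filter computed in 16 bits.
sharpen(x, y, c) = (padded16(x, y, c) * 2 -
                    (padded16(x - 1, y, c) + padded16(x, y - 1, c) +
                     padded16(x + 1, y, c) + padded16(x, y + 1, c)) / 4);

// Remap the result through the look-up table.
curved(x, y, c) = lut(sharpen(x, y, c));
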
Code Example #7
File: extern_producer.cpp  Project: 202198/Halide
int main(int argc, char **argv) {
    Func source;
    source.define_extern("make_data",
                         std::vector<ExternFuncArgument>(),
                         Float(32), 2);
    Func sink;
    Var x, y;
    sink(x, y) = source(x, y) - sin(x + y);

    Var xi, yi;
    sink.tile(x, y, xi, yi, 32, 32);

    // Compute the source per tile of sink
    source.compute_at(sink, x);

    Image<float> output = sink.realize(100, 100);

    // Should be all zeroes.
    RDom r(output);
    float error = evaluate<float>(sum(abs(output(r.x, r.y))));
    if (error != 0) {
        printf("Something went wrong\n");
        return -1;
    }

    printf("Success!\n");
    return 0;

}
Code Example #8
    explicit Test(int i) {
        // We use specific calls as proxies for verifying that compute_at
        // happens where we expect: sin() for the inner function, cos()
        // for the outer one; these are chosen mainly because they won't
        // ever get generated incidentally by the lowering code as part of
        // general code structure.
        inner = Func("inner" + std::to_string(i));
        inner(x, y, c) = sin(cast<float>(x + y + c));

        inner.compute_at(inner_compute_at).store_at(inner_store_at);

        outer = Func("outer" + std::to_string(i));
        outer(x, y, c) = cos(cast<float>(inner(x, y, c)));
    }
Code Example #9
File: image_wrap.cpp  Project: bleibig/Halide
int global_wrap_test() {
    Func source("source"), g("g"), h("h"), i("i");
    Var x("x"), y("y");

    source(x, y) = x + y;
    ImageParam img(Int(32), 2, "img");
    Buffer<int> buf = source.realize(200, 200);
    img.set(buf);

    g(x, y) = img(x, y);
    h(x, y) = g(x, y) + img(x, y);

    Var xi("xi"), yi("yi"), t("t");
    Func wrapper = img.in();
    Func img_f = img;
    img_f.compute_root();
    h.compute_root().tile(x, y, xi, yi, 16, 16).fuse(x, y, t).parallel(t);
    g.compute_at(h, yi);
    wrapper.compute_at(h, yi).tile(_0, _1, xi, yi, 8, 8).fuse(xi, yi, t).vectorize(t, 4);

    // Check the call graphs.
    // Expect 'g' to call 'wrapper', 'wrapper' to call 'img_f', 'img_f' to call 'img',
    // 'h' to call 'wrapper' and 'g'
    Module m = h.compile_to_module({h.infer_arguments()});
    CheckCalls c;
    m.functions().front().body.accept(&c);

    CallGraphs expected = {
        {h.name(), {g.name(), wrapper.name()}},
        {g.name(), {wrapper.name()}},
        {wrapper.name(), {img_f.name()}},
        {img_f.name(), {img.name()}},
    };
    if (check_call_graphs(c.calls, expected) != 0) {
        return -1;
    }

    Buffer<int> im = h.realize(200, 200);
    auto func = [](int x, int y) { return 2*(x + y); };
    if (check_image(im, func)) {
        return -1;
    }
    return 0;
}
Code Example #10
File: iir_blur.cpp  Project: JayHuangYC/Halide
// Defines a func to blur the columns of an input with a first order low
// pass IIR filter, followed by a transpose.
Func blur_cols_transpose(Func input, Expr height, Expr alpha) {
    Func blur;

    // Pure definition: do nothing.
    blur(x, y, c) = undef<float>();
    // Update 0: set the top row of the result to the input.
    blur(x, 0, c) = input(x, 0, c);
    // Update 1: run the IIR filter down the columns.
    RDom ry(1, height - 1);
    blur(x, ry, c) =
        (1 - alpha)*blur(x, ry - 1, c) + alpha*input(x, ry, c);
    // Update 2: run the IIR blur up the columns.
    Expr flip_ry = height - ry - 1;
    blur(x, flip_ry, c) =
        (1 - alpha)*blur(x, flip_ry + 1, c) + alpha*blur(x, flip_ry, c);

    // Transpose the blur.
    Func transpose;
    transpose(x, y, c) = blur(y, x, c);

    // Schedule:
    // Split the transpose into tiles of rows. Parallelize over channels
    // and strips (Halide supports nested parallelism).
    Var xo, yo;
    transpose.compute_root()
        .tile(x, y, xo, yo, x, y, 8, 8)
        .vectorize(x)
        .parallel(yo)
        .parallel(c);

    // Run the filter on each row of tiles (which corresponds to a strip of
    // columns in the input).
    blur.compute_at(transpose, yo);

    // Vectorize computations within the strips.
    blur.update(1)
        .reorder(x, ry)
        .vectorize(x);
    blur.update(2)
        .reorder(x, ry)
        .vectorize(x);

    return transpose;
}
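
Because the helper returns a transposed result, a full two-dimensional IIR blur is typically built by applying it twice, so the second transpose restores the original orientation. A hedged usage sketch, assuming the surrounding code supplies input, width, height, and alpha:

// Hypothetical composition of blur_cols_transpose(): blur the columns, then
// (because the result comes back transposed) blur the rows, which also
// transposes the image back to its original orientation.
Func blur_y = blur_cols_transpose(input, height, alpha);
Func blur   = blur_cols_transpose(blur_y, width, alpha);
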
Code Example #11
File: wrap.cpp  Project: cyanjc321/Halide
int global_wrap_test() {
    Func f("f"), g("g"), h("h"), i("i");
    Var x("x"), y("y");

    f(x, y) = x + y;
    g(x, y) = f(x, y);
    h(x, y) = g(x, y) + f(x, y);

    Var xi("xi"), yi("yi"), t("t");
    Func wrapper = f.in();
    f.compute_root();
    h.compute_root().tile(x, y, xi, yi, 16, 16).fuse(x, y, t).parallel(t);
    g.compute_at(h, yi);
    wrapper.compute_at(h, yi).tile(x, y, xi, yi, 8, 8).fuse(xi, yi, t).vectorize(t, 4);

    // Check the call graphs.
    // Expect 'g' to call 'wrapper', 'wrapper' to call 'f', 'f' to call nothing,
    // 'h' to call 'wrapper' and 'g'
    Module m = h.compile_to_module({});
    CheckCalls c;
    m.functions().front().body.accept(&c);

    CallGraphs expected = {
        {h.name(), {g.name(), wrapper.name()}},
        {g.name(), {wrapper.name()}},
        {wrapper.name(), {f.name()}},
        {f.name(), {}},
    };
    if (check_call_graphs(c.calls, expected) != 0) {
        return -1;
    }

    Image<int> im = h.realize(200, 200);
    auto func = [](int x, int y) {
        return 2*(x + y);
    };
    if (check_image(im, func)) {
        return -1;
    }
    return 0;
}
Code Example #12
File: side_effects.cpp  Project: drtpig/Halide
int main(int argc, char **argv) {
    Var x, y;

    Func mandelbrot;
    // Use a different scale on x and y because terminal characters
    // are not square. Arbitrarily chosen to fit the set nicely.
    Complex initial(x/20.0f, y/8.0f);
    Var z;
    mandelbrot(x, y, z) = Complex(0.0f, 0.0f);
    RDom t(1, 40);
    Complex current = mandelbrot(x, y, t-1);
    mandelbrot(x, y, t) = current*current + initial;

    // How many iterations until something escapes a circle of radius 2?
    Func count;
    Tuple escape = argmin(magnitude(mandelbrot(x, y, t)) < 4);
    // If it never escapes, use the value 0
    count(x, y) = select(escape[1], 0, escape[0]);

    RDom r(-45, 71, -10, 21);
    Func render;
    render() = 0;
    render() = draw_pixel(r.x, r.y, count(r.x, r.y));

    mandelbrot.compute_at(render, r.x);

    render.realize();

    printf("\n");

    // Check draw_pixel was called the right number of times.
    if (call_count != 71*21) {
        printf("Something went wrong\n");
        return -1;
    }

    printf("Success!\n");
    return 0;
}
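
draw_pixel and call_count are defined elsewhere in side_effects.cpp; presumably draw_pixel is a side-effecting C function exposed to Halide expressions via the HalideExtern macros. A rough sketch of that arrangement (the character-drawing details are illustrative assumptions):

// Hypothetical sketch of the side-effecting extern used by render() above.
// Requires <cstdio> and <algorithm>.
int call_count = 0;

extern "C" int draw_pixel(int x, int y, int val) {
    call_count++;
    // Map the escape-iteration count to an ASCII shade and print it.
    const char *shades = " .:-~*={}&%#@";
    putchar(shades[std::min(val, 12)]);
    if (x == 25) putchar('\n');   // last column of the rendered region (assumed)
    return 0;
}

// Makes draw_pixel callable from Halide Exprs, returning an Int(32).
HalideExtern_3(int, draw_pixel, int, int, int);
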
Code Example #13
File: camera_pipe.cpp  Project: DoDNet/Halide
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp,
             Param<float> gamma, Param<float> contrast) {

    Var xi, yi;

    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast);

    processed(tx, ty, c) = curved(tx, ty, c);

    // Schedule
    processed.bound(c, 0, 3); // bound color loop 0-3, properly
    if (schedule == 0) {
        // Compute in chunks over tiles, vectorized by 8
        denoised.compute_at(processed, tx).vectorize(x, 8);
        deinterleaved.compute_at(processed, tx).vectorize(x, 8).reorder(c, x, y).unroll(c);
        corrected.compute_at(processed, tx).vectorize(x, 4).reorder(c, x, y).unroll(c);
        processed.tile(tx, ty, xi, yi, 32, 32).reorder(xi, yi, c, tx, ty);
        processed.parallel(ty);
    } else if (schedule == 1) {
        // Same as above, but don't vectorize (sse is bad at interleaved 16-bit ops)
        denoised.compute_at(processed, tx);
        deinterleaved.compute_at(processed, tx);
        corrected.compute_at(processed, tx);
        processed.tile(tx, ty, xi, yi, 128, 128).reorder(xi, yi, c, tx, ty);
        processed.parallel(ty);
    } else {
        denoised.compute_root();
        deinterleaved.compute_root();
        corrected.compute_root();
        processed.compute_root();
    }

    return processed;
}
Code Example #14
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.

        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());

        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }

    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.

        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());

        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));

        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
Code Example #15
File: camera_pipe.cpp  Project: alinas/Halide
Func demosaic(Func deinterleaved) {
    // These are the values we already know from the input
    // x_y = the value of channel x at a site in the input of channel y
    // gb refers to green sites in the blue rows
    // gr refers to green sites in the red rows

    // Give more convenient names to the four channels we know
    Func r_r, g_gr, g_gb, b_b;
    g_gr(x, y) = deinterleaved(x, y, 0);
    r_r(x, y)  = deinterleaved(x, y, 1);
    b_b(x, y)  = deinterleaved(x, y, 2);
    g_gb(x, y) = deinterleaved(x, y, 3);

    // These are the ones we need to interpolate
    Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

    // First calculate green at the red and blue sites

    // Try interpolating vertically and horizontally. Also compute
    // differences vertically and horizontally. Use interpolation in
    // whichever direction had the smallest difference.
    Expr gv_r  = avg(g_gb(x, y-1), g_gb(x, y));
    Expr gvd_r = absd(g_gb(x, y-1), g_gb(x, y));
    Expr gh_r  = avg(g_gr(x+1, y), g_gr(x, y));
    Expr ghd_r = absd(g_gr(x+1, y), g_gr(x, y));

    g_r(x, y)  = select(ghd_r < gvd_r, gh_r, gv_r);

    Expr gv_b  = avg(g_gr(x, y+1), g_gr(x, y));
    Expr gvd_b = absd(g_gr(x, y+1), g_gr(x, y));
    Expr gh_b  = avg(g_gb(x-1, y), g_gb(x, y));
    Expr ghd_b = absd(g_gb(x-1, y), g_gb(x, y));

    g_b(x, y)  = select(ghd_b < gvd_b, gh_b, gv_b);

    // Next interpolate red at gr by first interpolating, then
    // correcting using the error green would have had if we had
    // interpolated it in the same way (i.e. add the second derivative
    // of the green channel at the same place).
    Expr correction;
    correction = g_gr(x, y) - avg(g_r(x, y), g_r(x-1, y));
    r_gr(x, y) = correction + avg(r_r(x-1, y), r_r(x, y));

    // Do the same for other reds and blues at green sites
    correction = g_gr(x, y) - avg(g_b(x, y), g_b(x, y-1));
    b_gr(x, y) = correction + avg(b_b(x, y), b_b(x, y-1));

    correction = g_gb(x, y) - avg(g_r(x, y), g_r(x, y+1));
    r_gb(x, y) = correction + avg(r_r(x, y), r_r(x, y+1));

    correction = g_gb(x, y) - avg(g_b(x, y), g_b(x+1, y));
    b_gb(x, y) = correction + avg(b_b(x, y), b_b(x+1, y));

    // Now interpolate diagonally to get red at blue and blue at
    // red. Hold onto your hats; this gets really fancy. We do the
    // same thing as for interpolating green where we try both
    // directions (in this case the positive and negative diagonals),
    // and use the one with the lowest absolute difference. But we
    // also use the same trick as interpolating red and blue at green
    // sites - we correct our interpolations using the second
    // derivative of green at the same sites.

    correction = g_b(x, y)  - avg(g_r(x, y), g_r(x-1, y+1));
    Expr rp_b  = correction + avg(r_r(x, y), r_r(x-1, y+1));
    Expr rpd_b = absd(r_r(x, y), r_r(x-1, y+1));

    correction = g_b(x, y)  - avg(g_r(x-1, y), g_r(x, y+1));
    Expr rn_b  = correction + avg(r_r(x-1, y), r_r(x, y+1));
    Expr rnd_b = absd(r_r(x-1, y), r_r(x, y+1));

    r_b(x, y)  = select(rpd_b < rnd_b, rp_b, rn_b);


    // Same thing for blue at red
    correction = g_r(x, y)  - avg(g_b(x, y), g_b(x+1, y-1));
    Expr bp_r  = correction + avg(b_b(x, y), b_b(x+1, y-1));
    Expr bpd_r = absd(b_b(x, y), b_b(x+1, y-1));

    correction = g_r(x, y)  - avg(g_b(x+1, y), g_b(x, y-1));
    Expr bn_r  = correction + avg(b_b(x+1, y), b_b(x, y-1));
    Expr bnd_r = absd(b_b(x+1, y), b_b(x, y-1));

    b_r(x, y)  =  select(bpd_r < bnd_r, bp_r, bn_r);

    // Interleave the resulting channels
    Func r = interleave_y(interleave_x(r_gr, r_r),
                          interleave_x(r_b, r_gb));
    Func g = interleave_y(interleave_x(g_gr, g_r),
                          interleave_x(g_b, g_gb));
    Func b = interleave_y(interleave_x(b_gr, b_r),
                          interleave_x(b_b, b_gb));

    Func output;
    output(x, y, c) = select(c == 0, r(x, y),
                             c == 1, g(x, y),
                                     b(x, y));


    /* THE SCHEDULE */
    int vec = target.natural_vector_size(UInt(16));
    if (target.has_feature(Target::HVX_64)) {
        vec = 32;
    } else if (target.has_feature(Target::HVX_128)) {
        vec = 64;
    }
    g_r.compute_at(processed, yi)
        .store_at(processed, yo)
        .vectorize(x, vec, TailStrategy::RoundUp)
        .fold_storage(y, 2);
    g_b.compute_at(processed, yi)
        .store_at(processed, yo)
        .vectorize(x, vec, TailStrategy::RoundUp)
        .fold_storage(y, 2);
    output.compute_at(processed, x)
        .vectorize(x)
        .unroll(y)
        .reorder(c, x, y)
        .unroll(c);

    if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        g_r.align_storage(x, vec);
        g_b.align_storage(x, vec);
    }

    return output;
}
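
The helpers avg, absd, interleave_x, and interleave_y are defined elsewhere in camera_pipe.cpp (absd also exists as a built-in Halide intrinsic). Plausible sketches of the others, under the assumption that avg rounds and widens to avoid 16-bit overflow and that the interleave helpers weave two half-resolution Funcs back together:

// Hedged sketches of the helpers demosaic() relies on; the real definitions
// in camera_pipe.cpp may differ in details such as widening and rounding.
Expr avg(Expr a, Expr b) {
    // Rounding average, widened so 16-bit inputs cannot overflow.
    return cast(a.type(), (cast<int32_t>(a) + cast<int32_t>(b) + 1) / 2);
}

Func interleave_x(Func a, Func b) {
    Func out;
    out(x, y) = select(x % 2 == 0, a(x / 2, y), b(x / 2, y));
    return out;
}

Func interleave_y(Func a, Func b) {
    Func out;
    out(x, y) = select(y % 2 == 0, a(x, y / 2), b(x, y / 2));
    return out;
}
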
Code Example #16
File: memoize.cpp  Project: josephsieh/Halide
int main(int argc, char **argv) {

    {
        call_count = 0;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f;
        f() = count_calls(0, 0);
        f.compute_root().memoize();

        Image<uint8_t> result1 = f.realize();
        Image<uint8_t> result2 = f.realize();

        assert(result1(0) == 42);
        assert(result2(0) == 42);

        assert(call_count == 1);
    }

    {
        call_count = 0;
        Param<int32_t> coord;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f, g;
        Var x, y;
        f() = count_calls(coord, coord);
        f.compute_root().memoize();

        g(x, y) = f();

        coord.set(0);
        Image<uint8_t> out1 = g.realize(256, 256);
        Image<uint8_t> out2 = g.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == 42);
                assert(out2(i, j) == 42);
            }
        }
        assert(call_count == 1);

        coord.set(1);
        Image<uint8_t> out3 = g.realize(256, 256);
        Image<uint8_t> out4 = g.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out3(i, j) == 42);
                assert(out4(i, j) == 42);
            }
        }
        assert(call_count == 2);
    }

    {
        call_count = 0;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (42 + 42));
                assert(out2(i, j) == (42 + 42));
            }
        }
        assert(call_count == 1);
    }

    call_count = 0;

    {
        Func count_calls_23;
        count_calls_23.define_extern("count_calls_with_arg",
                                     Internal::vec(ExternFuncArgument(cast<uint8_t>(23))),
                                     UInt(8), 2);

        Func count_calls_42;
        count_calls_42.define_extern("count_calls_with_arg",
                                     Internal::vec(ExternFuncArgument(cast<uint8_t>(42))),
                                     UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls_23(x, y) + count_calls_42(x, y);
        count_calls_23.compute_root().memoize();
        count_calls_42.compute_root().memoize();

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 42));
                assert(out2(i, j) == (23 + 42));
            }
        }
        assert(call_count_with_arg == 2);
    }

    {
        Param<uint8_t> val1;
        Param<uint8_t> val2;

        call_count_with_arg = 0;
        Func count_calls_val1;
        count_calls_val1.define_extern("count_calls_with_arg",
                                       Internal::vec(ExternFuncArgument(Expr(val1))),
                                       UInt(8), 2);

        Func count_calls_val2;
        count_calls_val2.define_extern("count_calls_with_arg",
                                       Internal::vec(ExternFuncArgument(Expr(val2))),
                                       UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls_val1(x, y) + count_calls_val2(x, y);
        count_calls_val1.compute_root().memoize();
        count_calls_val2.compute_root().memoize();

        val1.set(23);
        val2.set(42);

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        val1.set(42);
        Image<uint8_t> out3 = f.realize(256, 256);

        val1.set(23);
        Image<uint8_t> out4 = f.realize(256, 256);

        val1.set(42);
        Image<uint8_t> out5 = f.realize(256, 256);

        val2.set(57);
        Image<uint8_t> out6 = f.realize(256, 256);


        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 42));
                assert(out2(i, j) == (23 + 42));
                assert(out3(i, j) == (42 + 42));
                assert(out4(i, j) == (23 + 42));
                assert(out5(i, j) == (42 + 42));
                assert(out6(i, j) == (42 + 57));
            }
        }
        assert(call_count_with_arg == 4);
    }

    {
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        val.set(23.0f);
        Image<uint8_t> out1 = f.realize(256, 256);
        val.set(23.4f);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 23));
                assert(out2(i, j) == (23 + 23));
            }
        }
        assert(call_count_with_arg == 2);
    }

    {
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(memoize_tag(cast<uint8_t>(val)))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        val.set(23.0f);
        Image<uint8_t> out1 = f.realize(256, 256);
        val.set(23.4f);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 23));
                assert(out2(i, j) == (23 + 23));
            }
        }
        assert(call_count_with_arg == 1);
    }

    {
        // Case with bounds computed not equal to bounds realized.
        Param<float> val;
        Param<int32_t> index;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);
        Func f, g, h;
        Var x;

        f(x) = count_calls(x, 0) + cast<uint8_t>(x);
        g(x) = f(x);
        h(x) = g(4) + g(index);

        f.compute_root().memoize();
        g.vectorize(x, 8).compute_at(h, x);

        val.set(23.0f);
        index.set(2);
        Image<uint8_t> out1 = h.realize(1);

        assert(out1(0) == (uint8_t)(2 * 23 + 4 + 2));
        assert(call_count_with_arg == 3);

        index.set(4);
        out1 = h.realize(1);

        assert(out1(0) == (uint8_t)(2 * 23 + 4 + 4));
        assert(call_count_with_arg == 4);
    }

    {
        // Test Tuple case
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y, xi, yi;
        f(x, y) = Tuple(count_calls(x, y) + cast<uint8_t>(x), x);
        count_calls.compute_root().memoize();
        f.compute_root().memoize();

        Func g;
        g(x, y) = Tuple(f(x, y)[0] + f(x - 1, y)[0] + f(x + 1, y)[0], f(x, y)[1]);

        val.set(23.0f);
        Realization out = g.realize(128, 128);
        Image<uint8_t> out0 = out[0];
        Image<int32_t> out1 = out[1];


        for (int32_t i = 0; i < 100; i++) {
            for (int32_t j = 0; j < 100; j++) {
                assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
                assert(out1(i, j) == i);
            }
        }
        out = g.realize(128, 128);
        out0 = out[0];
        out1 = out[1];


        for (int32_t i = 0; i < 100; i++) {
            for (int32_t j = 0; j < 100; j++) {
                assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
                assert(out1(i, j) == i);
            }
        }
        assert(call_count_with_arg == 1);
    }

    {
        // Test cache eviction
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y, xi, yi;
        f(x, y) = count_calls(x, y) + cast<uint8_t>(x);
        count_calls.compute_root().memoize();

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);

        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }
        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count is %d.\n", call_count_with_arg);

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        // Test flushing entire cache with a single element larger than the cache
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y, xi, yi;
        f(x, y) = count_calls(x, y) + cast<uint8_t>(x);
        count_calls.compute_root().memoize();

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);

        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }

        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count before oversize realize is %d.\n", call_count_with_arg);
        call_count_with_arg = 0;

        Image<uint8_t> big = g.realize(1024, 1024);
        Image<uint8_t> big2 = g.realize(1024, 1024);

        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count after oversize realize is %d.\n", call_count_with_arg);

        call_count_with_arg = 0;
        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }

        fprintf(stderr, "Call count is %d.\n", call_count_with_arg);

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        // Test parallel cache access
        Param<float> val;

        Func count_calls;
        count_calls.define_extern("count_calls_with_arg_parallel",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 3);

        Func f;
        Var x, y;
        // Ensure that all calls map to the same cache key, but pass a thread ID
        // through to avoid having to do locking or an atomic add
        f(x, y) = count_calls(x, y % 4, memoize_tag(y / 16, 0)) + cast<uint8_t>(x);

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        count_calls.compute_at(f, y).memoize();
        f.compute_at(g, y).memoize();
        g.parallel(y, 16);

        val.set(23.0f);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);
        Image<uint8_t> out = g.realize(128, 128);

        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
                assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
            }
        }

        // TODO work out an assertion on call counts here.
        for (int i = 0; i < 8; i++) {
          fprintf(stderr, "Call count for thread %d is %d.\n", i, call_count_with_arg_parallel[i]);
        }

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        Param<float> val;

        Func f;
        Var x, y;
        f(x, y) = cast<uint8_t>((x << 8) + y);

        Func prev_func = f;

        Func stage[4];
        for (int i = 0; i < 4; i++) {
            std::vector<ExternFuncArgument> args(3);
            args[0] = cast<int32_t>(i);
            args[1] = cast<int32_t>(val);
            args[2] = prev_func;
            stage[i].define_extern("count_calls_staged",
                                   args,
                                   UInt(8), 2);
            prev_func = stage[i];
        }

        f.compute_root();
        for (int i = 0; i < 3; i++) {
          stage[i].compute_root();
        }
        stage[3].compute_root().memoize();
        val.set(23.0f);
        Image<uint8_t> result = stage[3].realize(128, 128);

        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
              assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23));
            }
        }

        for (int i = 0; i < 4; i++) {
          fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]);
        }

        result = stage[3].realize(128, 128);
        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
              assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23));
            }
        }

        for (int i = 0; i < 4; i++) {
            fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]);
        }

    }

    fprintf(stderr, "Success!\n");
    return 0;
}
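
The count_calls family of externs and their counters are defined elsewhere in memoize.cpp. A rough sketch of the basic one, assuming the legacy buffer_t extern interface and a densely laid-out output (the asserts above imply it fills its output with 42 and bumps a counter once per real call):

// Hypothetical sketch of the extern function this test memoizes; the real
// memoize.cpp version may differ. Requires <string.h>.
int call_count = 0;

extern "C" int count_calls(buffer_t *out) {
    if (out->host == nullptr) {
        return 0;                 // bounds query: accept the requested region
    }
    call_count++;                 // memoization should keep this from growing
    memset(out->host, 42, out->extent[0] * out->extent[1]);  // assumes dense rows
    return 0;
}
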
Code Example #17
File: game_of_life.cpp  Project: josephwinston/Halide
int main(int argc, char **argv) {

    Expr random_bit = cast<uint8_t>(random_float() > 0.5f);

    // First define the function that gives the initial state of the
    // game board
    {
        Func initial;

        initial(x, y, c) = random_bit;
        initial.compile_to_file("game_of_life_init");
    }

    // Then the function that updates the state. Also depends on user input.
    {
        ImageParam state(UInt(8), 3);
        Param<int> mouse_x, mouse_y;

        // Add a boundary condition.
        Func clamped;
        clamped(x, y, c) = state(clamp(x, state.left(), state.right()),
                                 clamp(y, state.top(), state.bottom()), c);

        Expr xm = max(x-1, 0), xp = min(x+1, state.width()-1);
        Expr ym = max(y-1, 0), yp = min(y+1, state.height()-1);

        // Count the number of live neighbors.
        Expr count = (clamped(x - 1, y - 1, c) + clamped(x, y - 1, c) +
                      clamped(x + 1, y - 1, c) + clamped(x - 1, y, c) +
                      clamped(x + 1, y, c) + clamped(x - 1, y + 1, c) +
                      clamped(x, y + 1, c) + clamped(x + 1, y + 1, c));

        // Was this pixel alive in the previous generation?
        Expr alive_before = state(x, y, c) != 0;

        // We're alive in the next generation if we have two neighbors and
        // were alive before, or if we have three neighbors.
        Expr alive_now = (count == 2 && alive_before) || count == 3;

        Expr alive = cast<uint8_t>(1);
        Expr dead = cast<uint8_t>(0);

        Func output;
        output(x, y, c) = select(alive_now, alive, dead);

        // Clobber part of the output around where the mouse is with random junk
        Expr min_x = clamp(mouse_x - 10, 0, state.width()-1);
        Expr max_x = clamp(mouse_x + 10, 0, state.width()-1);
        Expr min_y = clamp(mouse_y - 10, 0, state.height()-1);
        Expr max_y = clamp(mouse_y + 10, 0, state.height()-1);
        RDom clobber(min_x, max_x - min_x + 1, min_y, max_y - min_y + 1);

        Expr dx = clobber.x - mouse_x;
        Expr dy = clobber.y - mouse_y;
        Expr r = dx*dx + dy*dy;

        output(clobber.x, clobber.y, c) =
            select(r < 100,
                   cast<uint8_t>(random_float() < 0.25f),
                   output(clobber.x, clobber.y, c));

        output.vectorize(x, 16);
        clamped.compute_at(output, x);

        Var yi;
        output.split(y, y, yi, 16).reorder(x, yi, c, y).parallel(y);

        output.compile_to_file("game_of_life_update", state, mouse_x, mouse_y);
    }

    // Now the function that converts the state into an argb image.
    {
        ImageParam state(UInt(8), 3);

        Func state_32;
        state_32(x, y, c) = cast<int32_t>(state(x, y, c));

        Func render;
        Expr r = select(state_32(x, y, 0) == 1, 255, 0);
        Expr g = select(state_32(x, y, 1) == 1, 255, 0);
        Expr b = select(state_32(x, y, 2) == 1, 255, 0);
        render(x, y) = (255 << 24) + (r << 16) + (g << 8) + b;

        render.vectorize(x, 4);
        state_32.compute_at(render, x);

        Var yi;
        render.split(y, y, yi, 16).parallel(y);

        render.compile_to_file("game_of_life_render", state);
    }

    return 0;
}
Code Example #18
    // Now a schedule that uses CUDA or OpenCL.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell cuda that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, block, thread, 16);

        // Func::gpu_tile behaves the same as Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing
        // the intermediate result in shared memory. In the schedule
        // above xo corresponds to GPU blocks.
        padded.compute_at(curved, xo);

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
        // Metal are not enabled by default. We have to construct a
        // Target object, enable one of them, and then pass that
        // target object to compile_jit. Otherwise your CPU will very
        // slowly pretend it's a GPU, and use one thread per output
        // pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or Metal, depending on which platform
        // we're on. OS X doesn't update its OpenCL drivers, so they
        // tend to be broken. CUDA would also be a fine choice on
        // machines with NVidia GPUs.
        if (target.os == Target::OSX) {
            target.set_feature(Target::Metal);
        } else {
            target.set_feature(Target::OpenCL);
        }

        // Uncomment the next line and comment out the lines above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL, Metal, or CUDA API
        // calls done by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);

        curved.compile_jit(target);
    }
Code Example #19
int main(int argc, char **argv) {
    // Try doing vector loads with a boundary condition in various
    // ways and compare the performance.

    input = Image<uint16_t>(1024+8, 320);

    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = rand() & 0xfff;
        }
    }

    output = Image<uint16_t>(1024, 320);

    Var x, y;

    double t_ref, t_clamped, t_scalar, t_pad;

    {
        // Do an unclamped load to get a reference number
        Func f;
        f(x, y) = input(x, y) * 3 + input(x+1, y);

        f.vectorize(x, 8);

        t_ref = test(f, false);
    }

    {
        // Variant 1 - do the clamped vector load
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);

        t_clamped = test(f);
    }

    {
        // Variant 2 - do the load as a scalar op just before the vectorized stuff
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);
        g.compute_at(f, x);

        t_scalar = test(f);
    }

    {
        // Variant 3 - pad each scanline using scalar code
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);
        g.compute_at(f, y);

        t_pad = test(f);
    }

    // This constraint is pretty lax, because the op is so trivial
    // that the overhead of branching is large. For more complex ops,
    // the overhead should be smaller. We just make sure it's faster
    // than scalarizing or padding.
    if (t_clamped > t_scalar || t_clamped > t_pad) {
        printf("Clamped load timings suspicious:\n"
               "Unclamped: %f\n"
               "Clamped: %f\n"
               "Scalarize the load: %f\n"
               "Pad the input: %f\n",
               t_ref, t_clamped, t_scalar, t_pad);
        return -1;
    }

    printf("Success!\n");

    // Clean up our global images, otherwise you get destructor 
    // order weirdness. The images hold onto the JIT-compiled module
    // that created them, and will delete it when they die. However, 
    // it might not be possible to destroy the module cleanly after
    // main exits, because destroying the module touches globals
    // inside of llvm, and destructor order of globals is not 
    // guaranteed.
    input = Image<uint16_t>();
    output = Image<uint16_t>();

    return 0;
}
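
input, output, MIN, MAX, and the test() helper are file-scope definitions not included in this excerpt. A hedged sketch of what they might look like (the timing approach and the meaning of test()'s second argument are assumptions):

// Hypothetical sketch of the globals and helper assumed by the example above.
// Requires <chrono> and <algorithm>.
Image<uint16_t> input, output;
const int MIN = 0, MAX = 1024 + 8 - 1;   // clamp loads to the padded input width

double test(Func f, bool use_clamp = true) {
    f.compile_jit();
    f.realize(output);                    // warm up (includes compilation)
    double best = 1e30;
    for (int i = 0; i < 10; i++) {
        auto t0 = std::chrono::high_resolution_clock::now();
        f.realize(output);
        auto t1 = std::chrono::high_resolution_clock::now();
        best = std::min(best, std::chrono::duration<double>(t1 - t0).count());
    }
    return best;
}
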
Code Example #20
File: sort.cpp  Project: AheadIO/Halide
// Merge sort contiguous chunks of size s in a 1d func.
Func merge_sort(Func input, int total_size) {
    std::vector<Func> stages;
    Func result;

    const int parallel_work_size = 512;

    Func parallel_stage("parallel_stage");

    // First gather the input into a 2D array of width four where each row is sorted
    {
        assert(input.dimensions() == 1);
        // Use a small sorting network
        Expr a0 = input(4*y);
        Expr a1 = input(4*y+1);
        Expr a2 = input(4*y+2);
        Expr a3 = input(4*y+3);

        Expr b0 = min(a0, a1);
        Expr b1 = max(a0, a1);
        Expr b2 = min(a2, a3);
        Expr b3 = max(a2, a3);

        a0 = min(b0, b2);
        a1 = max(b0, b2);
        a2 = min(b1, b3);
        a3 = max(b1, b3);

        b0 = a0;
        b1 = min(a1, a2);
        b2 = max(a1, a2);
        b3 = a3;

        result(x, y) = select(x == 0, b0,
                              select(x == 1, b1,
                                  select(x == 2, b2, b3)));

        result.compute_at(parallel_stage, y).bound(x, 0, 4).unroll(x);

        stages.push_back(result);
    }

    // Now build up to the total size, merging each pair of rows
    for (int chunk_size = 4; chunk_size < total_size; chunk_size *= 2) {
        // "result" contains the sorted halves
        assert(result.dimensions() == 2);

        // Merge pairs of rows from the partial result
        Func merge_rows("merge_rows");
        RDom r(0, chunk_size*2);

        // The first dimension of merge_rows is within the chunk, and the
        // second dimension is the chunk index.  Keeps track of two
        // pointers we're merging from and an output value.
        merge_rows(x, y) = Tuple(0, 0, cast(input.value().type(), 0));

        Expr candidate_a = merge_rows(r-1, y)[0];
        Expr candidate_b = merge_rows(r-1, y)[1];
        Expr valid_a = candidate_a < chunk_size;
        Expr valid_b = candidate_b < chunk_size;
        Expr value_a = result(clamp(candidate_a, 0, chunk_size-1), 2*y);
        Expr value_b = result(clamp(candidate_b, 0, chunk_size-1), 2*y+1);
        merge_rows(r, y) = tuple_select(valid_a && ((value_a < value_b) || !valid_b),
                                        Tuple(candidate_a + 1, candidate_b, value_a),
                                        Tuple(candidate_a, candidate_b + 1, value_b));


        if (chunk_size <= parallel_work_size) {
            merge_rows.compute_at(parallel_stage, y);
        } else {
            merge_rows.compute_root();
        }

        if (chunk_size == parallel_work_size) {
            parallel_stage(x, y) = merge_rows(x, y)[2];
            parallel_stage.compute_root().parallel(y);
            result = parallel_stage;
        } else {
            result = lambda(x, y, merge_rows(x, y)[2]);
        }
    }

    // Convert back to 1D
    return lambda(x, result(x, 0));
}
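
A hedged usage sketch for the helper above (names are illustrative; total_size is assumed to be a power of two of at least 4, matching the chunk doubling):

// Hypothetical usage of merge_sort(); the surrounding test presumably does
// something similar and then checks the result against a reference sort.
const int N = 1 << 10;
Func data;
data(x) = random_int();
data.compute_root();                  // materialize so each element is drawn once
Func sorted = merge_sort(data, N);
Image<int> result = sorted.realize(N);
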
Code Example #21
File: camera_pipe.cpp  Project: DoDNet/Halide
Func demosaic(Func deinterleaved) {
    // These are the values we already know from the input
    // x_y = the value of channel x at a site in the input of channel y
    // gb refers to green sites in the blue rows
    // gr refers to green sites in the red rows

    // Give more convenient names to the four channels we know
    Func r_r, g_gr, g_gb, b_b;
    g_gr(x, y) = deinterleaved(x, y, 0);
    r_r(x, y)  = deinterleaved(x, y, 1);
    b_b(x, y)  = deinterleaved(x, y, 2);
    g_gb(x, y) = deinterleaved(x, y, 3);

    // These are the ones we need to interpolate
    Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

    // First calculate green at the red and blue sites

    // Try interpolating vertically and horizontally. Also compute
    // differences vertically and horizontally. Use interpolation in
    // whichever direction had the smallest difference.
    Expr gv_r  = avg(g_gb(x, y-1), g_gb(x, y));
    Expr gvd_r = absd(g_gb(x, y-1), g_gb(x, y));
    Expr gh_r  = avg(g_gr(x+1, y), g_gr(x, y));
    Expr ghd_r = absd(g_gr(x+1, y), g_gr(x, y));

    g_r(x, y)  = select(ghd_r < gvd_r, gh_r, gv_r);

    Expr gv_b  = avg(g_gr(x, y+1), g_gr(x, y));
    Expr gvd_b = absd(g_gr(x, y+1), g_gr(x, y));
    Expr gh_b  = avg(g_gb(x-1, y), g_gb(x, y));
    Expr ghd_b = absd(g_gb(x-1, y), g_gb(x, y));

    g_b(x, y)  = select(ghd_b < gvd_b, gh_b, gv_b);

    // Next interpolate red at gr by first interpolating, then
    // correcting using the error green would have had if we had
    // interpolated it in the same way (i.e. add the second derivative
    // of the green channel at the same place).
    Expr correction;
    correction = g_gr(x, y) - avg(g_r(x, y), g_r(x-1, y));
    r_gr(x, y) = correction + avg(r_r(x-1, y), r_r(x, y));

    // Do the same for other reds and blues at green sites
    correction = g_gr(x, y) - avg(g_b(x, y), g_b(x, y-1));
    b_gr(x, y) = correction + avg(b_b(x, y), b_b(x, y-1));

    correction = g_gb(x, y) - avg(g_r(x, y), g_r(x, y+1));
    r_gb(x, y) = correction + avg(r_r(x, y), r_r(x, y+1));

    correction = g_gb(x, y) - avg(g_b(x, y), g_b(x+1, y));
    b_gb(x, y) = correction + avg(b_b(x, y), b_b(x+1, y));

    // Now interpolate diagonally to get red at blue and blue at
    // red. Hold onto your hats; this gets really fancy. We do the
    // same thing as for interpolating green where we try both
    // directions (in this case the positive and negative diagonals),
    // and use the one with the lowest absolute difference. But we
    // also use the same trick as interpolating red and blue at green
    // sites - we correct our interpolations using the second
    // derivative of green at the same sites.

    correction = g_b(x, y)  - avg(g_r(x, y), g_r(x-1, y+1));
    Expr rp_b  = correction + avg(r_r(x, y), r_r(x-1, y+1));
    Expr rpd_b = absd(r_r(x, y), r_r(x-1, y+1));

    correction = g_b(x, y)  - avg(g_r(x-1, y), g_r(x, y+1));
    Expr rn_b  = correction + avg(r_r(x-1, y), r_r(x, y+1));
    Expr rnd_b = absd(r_r(x-1, y), r_r(x, y+1));

    r_b(x, y)  = select(rpd_b < rnd_b, rp_b, rn_b);


    // Same thing for blue at red
    correction = g_r(x, y)  - avg(g_b(x, y), g_b(x+1, y-1));
    Expr bp_r  = correction + avg(b_b(x, y), b_b(x+1, y-1));
    Expr bpd_r = absd(b_b(x, y), b_b(x+1, y-1));

    correction = g_r(x, y)  - avg(g_b(x+1, y), g_b(x, y-1));
    Expr bn_r  = correction + avg(b_b(x+1, y), b_b(x, y-1));
    Expr bnd_r = absd(b_b(x+1, y), b_b(x, y-1));

    b_r(x, y)  =  select(bpd_r < bnd_r, bp_r, bn_r);

    // Interleave the resulting channels
    Func r = interleave_y(interleave_x(r_gr, r_r),
                          interleave_x(r_b, r_gb));
    Func g = interleave_y(interleave_x(g_gr, g_r),
                          interleave_x(g_b, g_gb));
    Func b = interleave_y(interleave_x(b_gr, b_r),
                          interleave_x(b_b, b_gb));

    Func output;
    output(x, y, c) = select(c == 0, r(x, y),
                             c == 1, g(x, y),
                                     b(x, y));


    /* THE SCHEDULE */
    if (schedule == 0) {
        // optimized for ARM
        // Compute these in chunks over tiles, vectorized by 8
        g_r.compute_at(processed, tx).vectorize(x, 8);
        g_b.compute_at(processed, tx).vectorize(x, 8);
        r_gr.compute_at(processed, tx).vectorize(x, 8);
        b_gr.compute_at(processed, tx).vectorize(x, 8);
        r_gb.compute_at(processed, tx).vectorize(x, 8);
        b_gb.compute_at(processed, tx).vectorize(x, 8);
        r_b.compute_at(processed, tx).vectorize(x, 8);
        b_r.compute_at(processed, tx).vectorize(x, 8);
        // These interleave in y, so unrolling them in y helps
        output.compute_at(processed, tx)
            .vectorize(x, 8)
            .unroll(y, 2)
            .reorder(c, x, y).bound(c, 0, 3).unroll(c);
    } else if (schedule == 1) {
        // optimized for x86
        // Don't vectorize, because SSE is bad at 16-bit interleaving
        g_r.compute_at(processed, tx);
        g_b.compute_at(processed, tx);
        r_gr.compute_at(processed, tx);
        b_gr.compute_at(processed, tx);
        r_gb.compute_at(processed, tx);
        b_gb.compute_at(processed, tx);
        r_b.compute_at(processed, tx);
        b_r.compute_at(processed, tx);
        // These interleave in x and y, so unrolling them helps
        output.compute_at(processed, tx).unroll(x, 2).unroll(y, 2)
            .reorder(c, x, y).bound(c, 0, 3).unroll(c);

    } else {
        // Basic naive schedule
        g_r.compute_root();
        g_b.compute_root();
        r_gr.compute_root();
        b_gr.compute_root();
        r_gb.compute_root();
        b_gb.compute_root();
        r_b.compute_root();
        b_r.compute_root();
        output.compute_root();
    }
    return output;
}
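
The demosaic routine above leans on a handful of helpers (avg, interleave_x, interleave_y) that are defined elsewhere in camera_pipe.cpp and are not part of this excerpt; absd is Halide's built-in absolute-difference operator. Below is a minimal sketch of plausible definitions for the rest. The rounding/widening details and exact spellings are assumptions, not the file's verbatim code, and x and y are the file-scope Vars used throughout the snippet.

Expr avg(Expr a, Expr b) {
    // Assumed: widen to 32-bit so 16-bit samples can't overflow, then take a
    // rounding average and narrow back to the original type.
    return cast(a.type(), (cast<int32_t>(a) + cast<int32_t>(b) + 1) / 2);
}

Func interleave_x(Func a, Func b) {
    // Re-interleave two half-width planes: even output columns come from a,
    // odd output columns from b.
    Func out;
    out(x, y) = select(x % 2 == 0, a(x / 2, y), b(x / 2, y));
    return out;
}

Func interleave_y(Func a, Func b) {
    // Same idea along y: even rows from a, odd rows from b.
    Func out;
    out(x, y) = select(y % 2 == 0, a(x, y / 2), b(x, y / 2));
    return out;
}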
Code example #22
int main(int argc, char **argv) {
    // Try doing vector loads with a boundary condition in various
    // ways and compare the performance.

    input = Image<uint16_t>(1024+8, 320);

    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = rand() & 0xfff;
        }
    }

    output = Image<uint16_t>(1024, 320);

    Var x, y;

    double t_ref, t_clamped, t_scalar, t_pad;

    {
        // Do an unclamped load to get a reference number
        Func f;
        f(x, y) = input(x, y) * 3 + input(x+1, y);

        f.vectorize(x, 8);

        t_ref = test(f, false);
    }

    {
        // Variant 1 - do the clamped vector load
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);

        t_clamped = test(f);
    }

    {
        // Variant 2 - do the load as a scalar op just before the vectorized stuff
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);
        g.compute_at(f, x);

        t_scalar = test(f);
    }

    {
        // Variant 3 - pad each scanline using scalar code
        Func g;
        g(x, y) = input(clamp(x, MIN, MAX), y);

        Func f;
        f(x, y) = g(x, y) * 3 + g(x+1, y);

        f.vectorize(x, 8);
        g.compute_at(f, y);

        t_pad = test(f);
    }

    // This constraint is pretty lax, because the op is so trivial
    // that the overhead of branching is large. For more complex ops,
    // the overhead should be smaller.
    if (t_clamped > 5.0f * t_ref || t_clamped > t_scalar || t_clamped > t_pad) {
        printf("Clamped load timings suspicious:\n"
               "Unclamped: %f\n"
               "Clamped: %f\n"
               "Scalarize the load: %f\n"
               "Pad the input: %f\n",
               t_ref, t_clamped, t_scalar, t_pad);
        return -1;
    }

    printf("Success!\n");

    return 0;
}
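
This benchmark relies on file-scope state declared above main() and not shown here: the input and output images, the MIN/MAX clamp bounds, and a test() helper that JIT-compiles a Func, runs it into output, and returns a time (its second argument appears to toggle a correctness check). A rough sketch of what that scaffolding could look like, using std::chrono rather than whatever timer the original file uses; the bounds and the structure of test() are assumptions:

#include <algorithm>
#include <chrono>
// (Halide.h and using namespace Halide are assumed from the top of the file.)

Image<uint16_t> input, output;          // allocated in main() above
const int MIN = 0, MAX = 1024 + 8 - 1;  // assumed: clamp to the padded input's valid range

double test(Func f, bool check = true) {
    f.compile_jit();
    f.realize(output);                  // warm-up run (also forces compilation)
    double best_ms = 1e30;
    for (int i = 0; i < 10; i++) {
        auto t0 = std::chrono::high_resolution_clock::now();
        f.realize(output);
        auto t1 = std::chrono::high_resolution_clock::now();
        best_ms = std::min(best_ms,
            std::chrono::duration<double, std::milli>(t1 - t0).count());
    }
    // 'check' would compare against the unclamped reference in the real file;
    // that comparison is omitted in this sketch.
    (void)check;
    return best_ms;
}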
Code example #23
    // Now a schedule that uses CUDA or OpenCL.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell CUDA that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, 16);

        // The Func::gpu_tile method is similar to Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing the
        // intermediate result in shared memory. Var::gpu_blocks and
        // Var::gpu_threads exist to help you schedule producers within
        // GPU threads and blocks.
        padded.compute_at(curved, Var::gpu_blocks());

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
        // not enabled by default. We have to construct a Target
        // object, enable one of them, and then pass that target
        // object to compile_jit. Otherwise your CPU will very slowly
        // pretend it's a GPU, and use one thread per output pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or CUDA.

        // We'll enable OpenCL here, because it tends to give better
        // performance than CUDA, even with NVidia's drivers, because
        // NVidia's open source LLVM backend doesn't seem to do all
        // the same optimizations their proprietary compiler does.
        target.features |= Target::OpenCL;

        // Uncomment the next line and comment out the line above to
        // try CUDA instead.
        // target.features |= Target::CUDA;

        // If you want to see all of the OpenCL or CUDA API calls done
        // by the pipeline, you can also enable the GPUDebug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        //target.features |= Target::GPUDebug;

        curved.compile_jit(target);
    }
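
The lesson stops at compile_jit. Below is a small usage sketch, not part of the original tutorial, of how the scheduled pipeline might then be run; it assumes the surrounding class exposes the input image and that this Halide version's Image type provides copy_to_host(), as the GPU lessons of that era do.

// Run the GPU-scheduled pipeline. Halide copies 'input' to the device the
// first time the pipeline runs and reuses it on later runs.
Image<uint8_t> result(input.width(), input.height(), input.channels());
curved.realize(result);   // executes on the GPU
result.copy_to_host();    // bring the data back before reading it on the CPU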
Code example #24
File: fft.cpp  Project: alinas/Halide
ComplexFunc fft2d_r2c(Func r,
                      const vector<int> &R0,
                      const vector<int> &R1,
                      const Target& target,
                      const Fft2dDesc& desc) {
    string prefix = desc.name.empty() ? "r2c_" : desc.name + "_";

    vector<Var> args(r.args());
    Var n0(args[0]), n1(args[1]);
    args.erase(args.begin());
    args.erase(args.begin());

    // Get the innermost variable outside the FFT.
    Var outer = Var::outermost();
    if (!args.empty()) {
        outer = args.front();
    }

    int N0 = product(R0);
    int N1 = product(R1);

    // Cache of twiddle factors for this FFT.
    TwiddleFactorSet twiddle_cache;

    // The gain requested of the FFT.
    Expr gain = desc.gain;

    // Combine pairs of real columns x, y into complex columns z = x + j y. This
    // allows us to compute two real DFTs using one complex FFT. See the large
    // comment above this function for more background.
    //
    // An implementation detail is that we zip the columns in groups from the
    // input data to enable the loads to be dense vectors. x is taken from the
    // even-indexed groups of columns, y is taken from the odd-indexed groups of
    // columns.
    //
    // Changing the group size can change the result very slightly, because it
    // regroups floating-point operations. To keep results reproducible, if the
    // FFT description specifies a vector width, use it as the group size.
    ComplexFunc zipped(prefix + "zipped");
    int zip_width = desc.vector_width;
    if (zip_width <= 0) {
        zip_width = target.natural_vector_size(r.output_types()[0]);
    }
    // Ensure the zip width divides the zipped extent.
    zip_width = gcd(zip_width, N0 / 2);
    Expr zip_n0 = (n0 / zip_width) * zip_width * 2 + (n0 % zip_width);
    zipped(A({n0, n1}, args)) =
        ComplexExpr(r(A({zip_n0, n1}, args)),
                    r(A({zip_n0 + zip_width, n1}, args)));
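    // Note on the index mapping: zipped column group k reads its real parts
    // from input column group 2*k and its imaginary parts from group 2*k + 1,
    // so both loads above are dense vectors of width zip_width.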

    // DFT down the columns first.
    ComplexFunc dft1 = fft_dim1(zipped,
                                R1,
                                -1,  // sign
                                std::min(zip_width, N0 / 2),  // extent of dim 0
                                1.0f,
                                false,  // We parallelize unzipped below instead.
                                prefix,
                                target,
                                &twiddle_cache);

    // Unzip the two groups of real DFTs we zipped together above. For more
    // information about the unzipping operation, see the large comment above this
    // function.
    ComplexFunc unzipped(prefix + "unzipped");
    {
        Expr unzip_n0 = (n0 / (zip_width * 2)) * zip_width + (n0 % zip_width);
        ComplexExpr Z = dft1(A({unzip_n0, n1}, args));
        ComplexExpr conjsymZ = conj(dft1(A({unzip_n0, (N1 - n1) % N1}, args)));
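
        // Why this works: the zipped signal is z = x + j*y with x and y real,
        // so conj(Z(N1 - n1)) = X(n1) - j*Y(n1). Adding and subtracting Z(n1)
        // therefore yields 2*X(n1) and 2*j*Y(n1); the factor of two is folded
        // into the gain below instead of dividing here.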

        ComplexExpr X = Z + conjsymZ;
        ComplexExpr Y = -j * (Z - conjsymZ);
        // Rather than divide the above expressions by 2 here, adjust the gain
        // instead.
        gain /= 2;

        unzipped(A({n0, n1}, args)) =
            select(n0 % (zip_width * 2) < zip_width, X, Y);
    }

    // Zip the DC and Nyquist DFT bin rows, which should be real.
    ComplexFunc zipped_0(prefix + "zipped_0");
    zipped_0(A({n0, n1}, args)) =
        select(n1 > 0, likely(unzipped(A({n0, n1}, args))),
                       ComplexExpr(re(unzipped(A({n0, 0}, args))),
                                   re(unzipped(A({n0, N1 / 2}, args)))));

    // The vectorization of the columns must not exceed this value.
    int zipped_extent0 = std::min((N1 + 1) / 2, zip_width);

    // transpose so we can FFT dimension 0 (by making it dimension 1).
    ComplexFunc unzippedT, unzippedT_tiled;
    std::tie(unzippedT, unzippedT_tiled) = tiled_transpose(zipped_0, zipped_extent0, target, prefix);

    // DFT down the columns again (the rows of the original).
    ComplexFunc dftT = fft_dim1(unzippedT,
                                R0,
                                -1,  // sign
                                zipped_extent0,
                                gain,
                                desc.parallel,
                                prefix,
                                target,
                                &twiddle_cache);

    // transpose the result back to the original orientation, unless the caller
    // requested a transposed DFT.
    ComplexFunc dft = transpose(dftT);

    // We are going to add a row to the result (with update steps) by unzipping
    // the DC and Nyquist bin rows. To avoid unnecessarily computing some junk for
    // this row before we overwrite it, pad the pure definition with undef.
    dft = ComplexFunc(constant_exterior((Func)dft, Tuple(undef_z()), Expr(), Expr(), Expr(0), Expr(N1 / 2)));

    // Unzip the DFTs of the DC and Nyquist bin DFTs. Unzip the Nyquist DFT first,
    // because the DC bin DFT is updated in-place. For more information about
    // this, see the large comment above this function.
    RDom n0z1(1, N0 / 2);
    RDom n0z2(N0 / 2, N0 / 2);
    // Update 0: Unzip the DC bin of the DFT of the Nyquist bin row.
    dft(A({0, N1 / 2}, args)) = im(dft(A({0, 0}, args)));
    // Update 1: Unzip the rest of the DFT of the Nyquist bin row.
    dft(A({n0z1, N1 / 2}, args)) =
        0.5f * -j * (dft(A({n0z1, 0}, args)) - conj(dft(A({N0 - n0z1, 0}, args))));
    // Update 2: Compute the rest of the Nyquist bin row via conjugate symmetry.
    // Note that this redundantly computes n0 = N0/2, but that's faster and easier
    // than trying to deal with N0/2 - 1 bins.
    dft(A({n0z2, N1 / 2}, args)) = conj(dft(A({N0 - n0z2, N1 / 2}, args)));

    // Update 3: Unzip the DC bin of the DFT of the DC bin row.
    dft(A({0, 0}, args)) = re(dft(A({0, 0}, args)));
    // Update 4: Unzip the rest of the DFT of the DC bin row.
    dft(A({n0z1, 0}, args)) =
        0.5f * (dft(A({n0z1, 0}, args)) + conj(dft(A({N0 - n0z1, 0}, args))));
    // Update 5: Compute the rest of the DC bin row via conjugate symmetry.
    // Note that this redundantly computes n0 = N0/2, but that's faster and easier
    // than trying to deal with N0/2 - 1 bins.
    dft(A({n0z2, 0}, args)) = conj(dft(A({N0 - n0z2, 0}, args)));

    // Schedule.
    dftT.compute_at(dft, outer);

    // Schedule the tiled transposes.
    if (unzippedT_tiled.defined()) {
        unzippedT_tiled.compute_at(dftT, group);
    }

    // Schedule the input, if requested.
    if (desc.schedule_input) {
        r.compute_at(dft1, group);
    }

    // Vectorize the zip groups, and unroll by a factor of 2 to simplify the
    // even/odd selection.
    Var n0o("n0o"), n0i("n0i");
    unzipped.compute_at(dft, outer)
        .split(n0, n0o, n0i, zip_width * 2)
        .reorder(n0i, n1, n0o)
        .vectorize(n0i, zip_width)
        .unroll(n0i);
    dft1.compute_at(unzipped, n0o);
    if (desc.parallel) {
        // Note that this also parallelizes dft1, which is computed inside this loop
        // of unzipped.
        unzipped.parallel(n0o);
    }

    // Schedule the final DFT transpose and unzipping updates.
    dft.vectorize(n0, target.natural_vector_size<float>())
        .unroll(n0, std::min(N0 / target.natural_vector_size<float>(), 4));

    // The Nyquist bin at n0z = N0/2 looks like a race condition because it
    // simplifies to an expression similar to the DC bin. However, we include it
    // in the reduction because it makes the reduction have length N/2, which is
    // convenient for vectorization, and just ignore the resulting appearance of
    // a race condition.
    dft.update(1).allow_race_conditions()
        .vectorize(n0z1, target.natural_vector_size<float>());
    dft.update(2).allow_race_conditions()
        .vectorize(n0z2, target.natural_vector_size<float>());
    dft.update(4).allow_race_conditions()
        .vectorize(n0z1, target.natural_vector_size<float>());
    dft.update(5).allow_race_conditions()
        .vectorize(n0z2, target.natural_vector_size<float>());

    // Our result is undefined outside these bounds.
    dft.bound(n0, 0, N0);
    dft.bound(n1, 0, (N1 + 1) / 2 + 1);

    return dft;
}