// Builds the raw-to-output pipeline (hot pixel suppression, deinterleave,
// demosaic, color correction, tone curve) into `processed`, schedules every
// stage relative to `processed`, and returns it.
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000,
             Param<float> color_temp, Param<float> gamma, Param<float> contrast,
             Param<int> blackLevel, Param<int> whiteLevel) {
    Var yii, xi;

    // The algorithm: each stage consumes the previous one.
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast, blackLevel, whiteLevel);

    processed(x, y, c) = curved(x, y, c);

    // Schedule
    Expr out_width = processed.output_buffer().width();
    Expr out_height = processed.output_buffer().height();

    const int strip_size = 32;

    // Vector width: fixed for the two HVX modes, otherwise whatever the
    // target reports for 16-bit unsigned lanes.
    const bool hvx_64 = target.has_feature(Target::HVX_64);
    const bool hvx_128 = target.has_feature(Target::HVX_128);
    const int vec = hvx_64  ? 32
                  : hvx_128 ? 64
                            : target.natural_vector_size(UInt(16));
    const int wide = 2 * vec;

    denoised.compute_at(processed, yi)
        .store_at(processed, yo)
        .fold_storage(y, 8)
        .vectorize(x, vec);

    deinterleaved.compute_at(processed, yi)
        .store_at(processed, yo)
        .fold_storage(y, 4)
        .vectorize(x, wide, TailStrategy::RoundUp)
        .reorder(c, x, y)
        .unroll(c);

    corrected.compute_at(processed, x)
        .vectorize(x, vec)
        .reorder(c, x, y)
        .unroll(c);

    // Output: strips of `strip_size` rows run in parallel; inside a strip,
    // rows go in pairs and columns in vectors of `wide` lanes.
    processed.compute_root()
        .split(y, yo, yi, strip_size)
        .split(yi, yi, yii, 2)
        .split(x, x, xi, wide, TailStrategy::RoundUp)
        .reorder(xi, c, yii, x, yi, yo)
        .vectorize(xi, wide)
        .parallel(yo);

    if (hvx_64 || hvx_128) {
        processed.hexagon();
        denoised.align_storage(x, vec);
        deinterleaved.align_storage(x, vec);
        corrected.align_storage(x, vec);
    }

    // We can generate slightly better code if we know the splits divide
    // the extent, so round the output bounds down to whole vectors/strips.
    processed
        .bound(c, 0, 3)
        .bound(x, 0, (out_width / wide) * wide)
        .bound(y, 0, (out_height / strip_size) * strip_size);

    return processed;
}
int main(int argc, char **argv) { Func source; source.define_extern("make_data", std::vector<ExternFuncArgument>(), Float(32), 2); Func sink; Var x, y; sink(x, y) = source(x, y) - sin(x + y); Var xi, yi; sink.tile(x, y, xi, yi, 32, 32); // Compute the source per tile of sink source.compute_at(sink, x); Image<float> output = sink.realize(100, 100); // Should be all zeroes. RDom r(output); float error = evaluate_may_gpu<float>(sum(abs(output(r.x, r.y)))); if (error != 0) { printf("Something went wrong\n"); return -1; } Func multi; std::vector<Type> types; types.push_back(Float(32)); types.push_back(Float(32)); multi.define_extern("make_data_multi", std::vector<ExternFuncArgument>(), types, 2); Func sink_multi; sink_multi(x, y) = multi(x, y)[0] - sin(x + y) + multi(x, y)[1] - cos(x + y); sink_multi.tile(x, y, xi, yi, 32, 32); // Compute the source per tile of sink multi.compute_at(sink_multi, x); Image<float> output_multi = sink_multi.realize(100, 100); // Should be all zeroes. float error_multi = evaluate<float>(sum(abs(output_multi(r.x, r.y)))); if (error_multi != 0) { printf("Something went wrong in multi case\n"); return -1; } printf("Success!\n"); return 0; }
// Blurs `f` with performBlur, transposes the result, and rescales it to
// undo the attenuation introduced by the zero boundary condition.
Func blur_then_transpose(Func f, Func coeff, Expr size, Expr sigma) {
    Func blurred = performBlur(f, coeff, size, sigma);

    // Blur an image of ones the same way to measure how much the zero
    // boundary condition attenuates the result. Dividing by this is
    // equivalent to reweighting the Gaussian near the edge.
    // (TODO: add a generator param to select different boundary
    // conditions).
    Func ones;
    ones(x, y) = 1.0f;
    Func attenuation = performBlur(ones, coeff, size, sigma);

    // Store the reciprocal so the correction is a multiply. The
    // attenuation is the same for every row/channel, so a single column
    // is enough.
    Func inverse_attenuation;
    inverse_attenuation(y) = 1.0f / attenuation(0, y);

    // Swap the axes.
    Func transposed;
    transposed(x, y) = blurred(y, x);

    // Apply the attenuation correction.
    Func out;
    out(x, y) = transposed(x, y) * inverse_attenuation(x);

    // Schedule.
    Var yi, xi, yii, xii;

    attenuation.compute_root();
    inverse_attenuation.compute_root().vectorize(y, 8);

    out.compute_root()
        .tile(x, y, xi, yi, 8, 32)
        .tile(xi, yi, xii, yii, 8, 8)
        .vectorize(xii)
        .unroll(yii)
        .parallel(y);

    blurred.compute_at(out, y);
    transposed.compute_at(out, xi).vectorize(y).unroll(x);

    // Every update stage of the blur is vectorized and unrolled over x;
    // stages with a reduction domain also get x moved innermost.
    const int update_count = blurred.num_update_definitions();
    for (int stage = 0; stage < update_count; stage++) {
        RDom rdom = blurred.reduction_domain(stage);
        if (rdom.defined()) {
            blurred.update(stage).reorder(x, rdom);
        }
        blurred.update(stage).vectorize(x, 8).unroll(x);
    }

    return out;
}
int main(int argc, char **argv) { Func mandelbrot; Var x, y; Param<float> x_min, x_max, y_min, y_max, c_real, c_imag; Param<int> w, h, iters; Complex initial(lerp(x_min, x_max, cast<float>(x)/w), lerp(y_min, y_max, cast<float>(y)/h)); Complex c(c_real, c_imag); Var z; mandelbrot(x, y, z) = initial; RDom t(1, iters); Complex current = mandelbrot(x, y, t-1); mandelbrot(x, y, t) = current*current + c; // How many iterations until something escapes a circle of radius 2? Func count; Tuple escape = argmin(magnitude(mandelbrot(x, y, t)) < 4); // If it never escapes, use the value 0 count(x, y) = select(escape[1], 0, escape[0]); Var xi, yi, xo, yo; count.tile(x, y, xo, yo, xi, yi, 8, 8); count.parallel(yo).vectorize(xi, 4).unroll(xi).unroll(yi, 2); mandelbrot.compute_at(count, xo); Argument args[] = {x_min, x_max, y_min, y_max, c_real, c_imag, iters, w, h}; count.compile_to_file("mandelbrot", std::vector<Argument>(args, args + 9)); return 0; }
int main(int argc, char **argv) { Image<uint8_t> input = load<uint8_t>("P1070046.png"); timeval t1, t2; gettimeofday(&t1, NULL); Var x,y,c; Func toFloat; toFloat(c,x,y) = cast<float>(input(x,y,c))/255.0; Func toHSV; toHSV = hsv(toFloat); Func saturated; saturated(c,x,y) = select(c != 1, toHSV(c,x,y), clamp(1*fast_pow(toHSV(c,x,y),0.5), 0,1)); Func toRGB,toInt; toRGB = rgb(saturated); toInt(x,y,c) = cast<uint8_t>(toRGB(c,x,y)*255.0); Var y_outer,y_inner; toInt.reorder(c,x,y); toInt.split(y,y_outer, y_inner, 256); toInt.parallel(y_outer); toHSV.compute_at(toInt,x); Halide::Image<uint8_t> output = toInt.realize(input.width(),input.height(),input.channels()); gettimeofday(&t2, NULL); save(output,"vibSat.png"); std::cout<<float(t2.tv_sec - t1.tv_sec) + float(t2.tv_usec - t1.tv_usec)/1000000.0f << std::endl; return 0; }
// Now we define methods that give our pipeline several different // schedules. void schedule_for_cpu() { // Compute the look-up-table ahead of time. lut.compute_root(); // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Look-up-tables don't vectorize well, so just parallelize // curved in slices of 16 scanlines. Var yo, yi; curved.split(y, yo, yi, 16) .parallel(yo); // Compute sharpen as needed per scanline of curved. sharpen.compute_at(curved, yi); // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide. sharpen.vectorize(x, 8); // Compute the padded input as needed per scanline of curved, // reusing previous values computed within the same strip of // 16 scanlines. padded.store_at(curved, yo) .compute_at(curved, yi); // Also vectorize the padding. It's 8-bit, so we'll vectorize // 16-wide. padded.vectorize(x, 16); // JIT-compile the pipeline for the CPU. curved.compile_jit(); }
int main(int argc, char **argv) { Func source; source.define_extern("make_data", std::vector<ExternFuncArgument>(), Float(32), 2); Func sink; Var x, y; sink(x, y) = source(x, y) - sin(x + y); Var xi, yi; sink.tile(x, y, xi, yi, 32, 32); // Compute the source per tile of sink source.compute_at(sink, x); Image<float> output = sink.realize(100, 100); // Should be all zeroes. RDom r(output); float error = evaluate<float>(sum(abs(output(r.x, r.y)))); if (error != 0) { printf("Something went wrong\n"); return -1; } printf("Success!\n"); return 0; }
explicit Test(int i) { // We use specific calls as proxies for verifying that compute_at // happens where we expect: sin() for the inner function, cos() // for the outer one; these are chosen mainly because they won't // ever get generated incidentally by the lowering code as part of // general code structure. inner = Func("inner" + std::to_string(i)); inner(x, y, c) = sin(cast<float>(x + y + c)); inner.compute_at(inner_compute_at).store_at(inner_store_at); outer = Func("outer" + std::to_string(i)); outer(x, y, c) = cos(cast<float>(inner(x, y, c))); }
int global_wrap_test() { Func source("source"), g("g"), h("h"), i("i"); Var x("x"), y("y"); source(x, y) = x + y; ImageParam img(Int(32), 2, "img"); Buffer<int> buf = source.realize(200, 200); img.set(buf); g(x, y) = img(x, y); h(x, y) = g(x, y) + img(x, y); Var xi("xi"), yi("yi"), t("t"); Func wrapper = img.in(); Func img_f = img; img_f.compute_root(); h.compute_root().tile(x, y, xi, yi, 16, 16).fuse(x, y, t).parallel(t); g.compute_at(h, yi); wrapper.compute_at(h, yi).tile(_0, _1, xi, yi, 8, 8).fuse(xi, yi, t).vectorize(t, 4); // Check the call graphs. // Expect 'g' to call 'wrapper', 'wrapper' to call 'img_f', 'img_f' to call 'img', // 'h' to call 'wrapper' and 'g' Module m = h.compile_to_module({h.infer_arguments()}); CheckCalls c; m.functions().front().body.accept(&c); CallGraphs expected = { {h.name(), {g.name(), wrapper.name()}}, {g.name(), {wrapper.name()}}, {wrapper.name(), {img_f.name()}}, {img_f.name(), {img.name()}}, }; if (check_call_graphs(c.calls, expected) != 0) { return -1; } Buffer<int> im = h.realize(200, 200); auto func = [](int x, int y) { return 2*(x + y); }; if (check_image(im, func)) { return -1; } return 0; }
// Defines a func to blur the columns of an input with a first order low // pass IIR filter, followed by a transpose. Func blur_cols_transpose(Func input, Expr height, Expr alpha) { Func blur; // Pure definition: do nothing. blur(x, y, c) = undef<float>(); // Update 0: set the top row of the result to the input. blur(x, 0, c) = input(x, 0, c); // Update 1: run the IIR filter down the columns. RDom ry(1, height - 1); blur(x, ry, c) = (1 - alpha)*blur(x, ry - 1, c) + alpha*input(x, ry, c); // Update 2: run the IIR blur up the columns. Expr flip_ry = height - ry - 1; blur(x, flip_ry, c) = (1 - alpha)*blur(x, flip_ry + 1, c) + alpha*blur(x, flip_ry, c); // Transpose the blur. Func transpose; transpose(x, y, c) = blur(y, x, c); // Schedule: // Split the transpose into tiles of rows. Parallelize over channels // and strips (Halide supports nested parallelism). Var xo, yo; transpose.compute_root() .tile(x, y, xo, yo, x, y, 8, 8) .vectorize(x) .parallel(yo) .parallel(c); // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). blur.compute_at(transpose, yo); // Vectorize computations within the strips. blur.update(1) .reorder(x, ry) .vectorize(x); blur.update(2) .reorder(x, ry) .vectorize(x); return transpose; }
int global_wrap_test() { Func f("f"), g("g"), h("h"), i("i"); Var x("x"), y("y"); f(x, y) = x + y; g(x, y) = f(x, y); h(x, y) = g(x, y) + f(x, y); Var xi("xi"), yi("yi"), t("t"); Func wrapper = f.in(); f.compute_root(); h.compute_root().tile(x, y, xi, yi, 16, 16).fuse(x, y, t).parallel(t); g.compute_at(h, yi); wrapper.compute_at(h, yi).tile(x, y, xi, yi, 8, 8).fuse(xi, yi, t).vectorize(t, 4); // Check the call graphs. // Expect 'g' to call 'wrapper', 'wrapper' to call 'f', 'f' to call nothing, // 'h' to call 'wrapper' and 'g' Module m = h.compile_to_module({}); CheckCalls c; m.functions().front().body.accept(&c); CallGraphs expected = { {h.name(), {g.name(), wrapper.name()}}, {g.name(), {wrapper.name()}}, {wrapper.name(), {f.name()}}, {f.name(), {}}, }; if (check_call_graphs(c.calls, expected) != 0) { return -1; } Image<int> im = h.realize(200, 200); auto func = [](int x, int y) { return 2*(x + y); }; if (check_image(im, func)) { return -1; } return 0; }
int main(int argc, char **argv) { Var x, y; Func mandelbrot; // Use a different scale on x and y because terminal characters // are not square. Arbitrarily chosen to fit the set nicely. Complex initial(x/20.0f, y/8.0f); Var z; mandelbrot(x, y, z) = Complex(0.0f, 0.0f); RDom t(1, 40); Complex current = mandelbrot(x, y, t-1); mandelbrot(x, y, t) = current*current + initial; // How many iterations until something escapes a circle of radius 2? Func count; Tuple escape = argmin(magnitude(mandelbrot(x, y, t)) < 4); // If it never escapes, use the value 0 count(x, y) = select(escape[1], 0, escape[0]); RDom r(-45, 71, -10, 21); Func render; render() = 0; render() = draw_pixel(r.x, r.y, count(r.x, r.y)); mandelbrot.compute_at(render, r.x); render.realize(); printf("\n"); // Check draw_pixel was called the right number of times. if (call_count != 71*21) { printf("Something went wrong\n"); return -1; } printf("Success!\n"); return 0; }
// Builds the raw-to-output pipeline into `processed` and applies one of
// three schedules selected by `schedule`:
//   0 - tiled and vectorized, 1 - tiled without vectorization,
//   anything else - every stage computed at root.
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000,
             Param<float> color_temp, Param<float> gamma, Param<float> contrast) {
    Var xi, yi;

    // The algorithm: each stage consumes the previous one.
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast);

    processed(tx, ty, c) = curved(tx, ty, c);

    // Schedule
    processed.bound(c, 0, 3);  // bound color loop 0-3, properly

    const bool tiled = (schedule == 0) || (schedule == 1);
    if (tiled) {
        // Both tiled schedules compute the intermediates per output tile.
        denoised.compute_at(processed, tx);
        deinterleaved.compute_at(processed, tx);
        corrected.compute_at(processed, tx);

        // Schedule 0 also vectorizes them and uses smaller tiles;
        // schedule 1 leaves them scalar (sse is bad at interleaved
        // 16-bit ops).
        const bool vectorized = (schedule == 0);
        if (vectorized) {
            denoised.vectorize(x, 8);
            deinterleaved.vectorize(x, 8).reorder(c, x, y).unroll(c);
            corrected.vectorize(x, 4).reorder(c, x, y).unroll(c);
        }

        const int tile_size = vectorized ? 32 : 128;
        processed.tile(tx, ty, xi, yi, tile_size, tile_size)
            .reorder(xi, yi, c, tx, ty);
        processed.parallel(ty);
    } else {
        // Fallback: every stage is computed in full at root.
        denoised.compute_root();
        deinterleaved.compute_root();
        corrected.compute_root();
        processed.compute_root();
    }

    return processed;
}
int main(int argc, char **argv) { if (!get_jit_target_from_environment().has_gpu_feature()) { printf("Not running test because no gpu target enabled\n"); return 0; } { Func f; Var x, y, z; // Construct a Func with lots of potential race conditions, and // then run it in thread blocks on the gpu. f(x, y) = x + 100 * y; const int passes = 10; for (int i = 0; i < passes; i++) { RDom rx(0, 10); // Flip each row, using spots 10-19 as temporary storage f(rx + 10, y) = f(9 - rx, y); f(rx, y) = f(rx + 10, y); // Flip each column the same way RDom ry(0, 8); f(x, ry + 8) = f(x, 7 - ry); f(x, ry) = f(x, ry + 8); } Func g; g(x, y) = f(0, 0)+ f(9, 7); g.gpu_tile(x, y, 16, 8); f.compute_at(g, Var::gpu_blocks()); for (int i = 0; i < passes; i++) { f.update(i*4 + 0).gpu_threads(y); f.update(i*4 + 1).gpu_threads(y); f.update(i*4 + 2).gpu_threads(x); f.update(i*4 + 3).gpu_threads(x); } Image<int> out = g.realize(100, 100); for (int y = 0; y < out.height(); y++) { for (int x = 0; x < out.width(); x++) { int correct = 7*100 + 9; if (out(x, y) != correct) { printf("out(%d, %d) = %d instead of %d\n", x, y, out(x, y), correct); return -1; } } } } { // Construct a Func with undef stages, then run it in thread // blocks and make sure the right number of syncthreads are // added. Func f; Var x, y, z; f(x, y) = undef<int>(); f(x, y) += x + 100 * y; // This next line is dubious, because it entirely masks the // effect of the previous definition. If you add an undefined // value to the previous def, then Halide can evaluate this to // whatever it likes. Currently we'll just elide this update // definition. f(x, y) += undef<int>(); f(x, y) += y * 100 + x; Func g; g(x, y) = f(0, 0) + f(7, 7); g.gpu_tile(x, y, 8, 8); f.compute_at(g, Var::gpu_blocks()); f.gpu_threads(x, y); f.update(0).gpu_threads(x, y); f.update(1).gpu_threads(x, y); f.update(2).gpu_threads(x, y); // There should be two thread barriers: one in between the // non-undef definitions, and one between f and g. 
g.add_custom_lowering_pass(new CheckBarrierCount(2)); Image<int> out = g.realize(100, 100); } printf("Success!\n"); return 0; }
// Reconstructs full red, green and blue planes from the four deinterleaved
// Bayer channels and returns them as a single 3-channel Func.
Func demosaic(Func deinterleaved) {
    // Naming convention: x_y is the value of channel x at a site whose
    // input channel is y. "gr" means green sites in the red rows and
    // "gb" means green sites in the blue rows.

    // The four channels that come straight from the input.
    Func r_r, g_gr, g_gb, b_b;
    g_gr(x, y) = deinterleaved(x, y, 0);
    r_r(x, y)  = deinterleaved(x, y, 1);
    b_b(x, y)  = deinterleaved(x, y, 2);
    g_gb(x, y) = deinterleaved(x, y, 3);

    // The eight channels that have to be interpolated.
    Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

    // Green at the red and blue sites comes first. Interpolate both
    // vertically and horizontally, measure the difference in each
    // direction, and keep the interpolation along the direction with
    // the smaller difference.
    Expr gv_r  = avg(g_gb(x, y-1), g_gb(x, y));
    Expr gvd_r = absd(g_gb(x, y-1), g_gb(x, y));
    Expr gh_r  = avg(g_gr(x+1, y), g_gr(x, y));
    Expr ghd_r = absd(g_gr(x+1, y), g_gr(x, y));

    g_r(x, y) = select(ghd_r < gvd_r, gh_r, gv_r);

    Expr gv_b  = avg(g_gr(x, y+1), g_gr(x, y));
    Expr gvd_b = absd(g_gr(x, y+1), g_gr(x, y));
    Expr gh_b  = avg(g_gb(x-1, y), g_gb(x, y));
    Expr ghd_b = absd(g_gb(x-1, y), g_gb(x, y));

    g_b(x, y) = select(ghd_b < gvd_b, gh_b, gv_b);

    // Red and blue at the green sites: interpolate, then add the error
    // green would have had if it had been interpolated the same way
    // (i.e. the second derivative of the green channel at that place).
    Expr fix_r_gr = g_gr(x, y) - avg(g_r(x, y), g_r(x-1, y));
    r_gr(x, y) = fix_r_gr + avg(r_r(x-1, y), r_r(x, y));

    Expr fix_b_gr = g_gr(x, y) - avg(g_b(x, y), g_b(x, y-1));
    b_gr(x, y) = fix_b_gr + avg(b_b(x, y), b_b(x, y-1));

    Expr fix_r_gb = g_gb(x, y) - avg(g_r(x, y), g_r(x, y+1));
    r_gb(x, y) = fix_r_gb + avg(r_r(x, y), r_r(x, y+1));

    Expr fix_b_gb = g_gb(x, y) - avg(g_b(x, y), g_b(x+1, y));
    b_gb(x, y) = fix_b_gb + avg(b_b(x, y), b_b(x+1, y));

    // Red at blue and blue at red are interpolated diagonally. As with
    // green, both directions are tried (here the positive and negative
    // diagonals) and the one with the lowest absolute difference wins.
    // As with red/blue at green sites, each candidate is corrected with
    // the second derivative of green at the same sites.
    Expr fix_rp_b = g_b(x, y) - avg(g_r(x, y), g_r(x-1, y+1));
    Expr rp_b  = fix_rp_b + avg(r_r(x, y), r_r(x-1, y+1));
    Expr rpd_b = absd(r_r(x, y), r_r(x-1, y+1));

    Expr fix_rn_b = g_b(x, y) - avg(g_r(x-1, y), g_r(x, y+1));
    Expr rn_b  = fix_rn_b + avg(r_r(x-1, y), r_r(x, y+1));
    Expr rnd_b = absd(r_r(x-1, y), r_r(x, y+1));

    r_b(x, y) = select(rpd_b < rnd_b, rp_b, rn_b);

    // Same thing for blue at red
    Expr fix_bp_r = g_r(x, y) - avg(g_b(x, y), g_b(x+1, y-1));
    Expr bp_r  = fix_bp_r + avg(b_b(x, y), b_b(x+1, y-1));
    Expr bpd_r = absd(b_b(x, y), b_b(x+1, y-1));

    Expr fix_bn_r = g_r(x, y) - avg(g_b(x+1, y), g_b(x, y-1));
    Expr bn_r  = fix_bn_r + avg(b_b(x+1, y), b_b(x, y-1));
    Expr bnd_r = absd(b_b(x+1, y), b_b(x, y-1));

    b_r(x, y) = select(bpd_r < bnd_r, bp_r, bn_r);

    // Interleave the resulting channels
    Func r = interleave_y(interleave_x(r_gr, r_r),
                          interleave_x(r_b, r_gb));
    Func g = interleave_y(interleave_x(g_gr, g_r),
                          interleave_x(g_b, g_gb));
    Func b = interleave_y(interleave_x(b_gr, b_r),
                          interleave_x(b_b, b_gb));

    Func output;
    output(x, y, c) = select(c == 0, r(x, y),
                             c == 1, g(x, y),
                                     b(x, y));

    /* THE SCHEDULE */

    // Vector width: fixed for the two HVX modes, otherwise whatever the
    // target reports for 16-bit unsigned lanes.
    const bool hvx_64 = target.has_feature(Target::HVX_64);
    const bool hvx_128 = target.has_feature(Target::HVX_128);
    const int vec = hvx_64  ? 32
                  : hvx_128 ? 64
                            : target.natural_vector_size(UInt(16));

    // The interpolated greens are computed per yi of `processed`, stored
    // per yo, and folded over y with a factor of two.
    g_r.compute_at(processed, yi)
        .store_at(processed, yo)
        .vectorize(x, vec, TailStrategy::RoundUp)
        .fold_storage(y, 2);
    g_b.compute_at(processed, yi)
        .store_at(processed, yo)
        .vectorize(x, vec, TailStrategy::RoundUp)
        .fold_storage(y, 2);

    output.compute_at(processed, x)
        .vectorize(x)
        .unroll(y)
        .reorder(c, x, y)
        .unroll(c);

    if (hvx_64 || hvx_128) {
        g_r.align_storage(x, vec);
        g_b.align_storage(x, vec);
    }

    return output;
}
int main(int argc, char **argv) { { call_count = 0; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f; f() = count_calls(0, 0); f.compute_root().memoize(); Image<uint8_t> result1 = f.realize(); Image<uint8_t> result2 = f.realize(); assert(result1(0) == 42); assert(result2(0) == 42); assert(call_count == 1); } { call_count = 0; Param<int32_t> coord; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f, g; Var x, y; f() = count_calls(coord, coord); f.compute_root().memoize(); g(x, y) = f(); coord.set(0); Image<uint8_t> out1 = g.realize(256, 256); Image<uint8_t> out2 = g.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == 42); assert(out2(i, j) == 42); } } assert(call_count == 1); coord.set(1); Image<uint8_t> out3 = g.realize(256, 256); Image<uint8_t> out4 = g.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out3(i, j) == 42); assert(out4(i, j) == 42); } } assert(call_count == 2); } { call_count = 0; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (42 + 42)); assert(out2(i, j) == (42 + 42)); } } assert(call_count == 1); } call_count = 0; { Func count_calls_23; count_calls_23.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(23))), UInt(8), 2); Func count_calls_42; count_calls_42.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(42))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls_23(x, y) + count_calls_42(x, y); 
count_calls_23.compute_root().memoize(); count_calls_42.compute_root().memoize(); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 42)); assert(out2(i, j) == (23 + 42)); } } assert(call_count_with_arg == 2); } { Param<uint8_t> val1; Param<uint8_t> val2; call_count_with_arg = 0; Func count_calls_val1; count_calls_val1.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(Expr(val1))), UInt(8), 2); Func count_calls_val2; count_calls_val2.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(Expr(val2))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls_val1(x, y) + count_calls_val2(x, y); count_calls_val1.compute_root().memoize(); count_calls_val2.compute_root().memoize(); val1.set(23); val2.set(42); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); val1.set(42); Image<uint8_t> out3 = f.realize(256, 256); val1.set(23); Image<uint8_t> out4 = f.realize(256, 256); val1.set(42); Image<uint8_t> out5 = f.realize(256, 256); val2.set(57); Image<uint8_t> out6 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 42)); assert(out2(i, j) == (23 + 42)); assert(out3(i, j) == (42 + 42)); assert(out4(i, j) == (23 + 42)); assert(out5(i, j) == (42 + 42)); assert(out6(i, j) == (42 + 57)); } } assert(call_count_with_arg == 4); } { Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); val.set(23.0f); Image<uint8_t> out1 = f.realize(256, 256); val.set(23.4f); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) 
== (23 + 23)); assert(out2(i, j) == (23 + 23)); } } assert(call_count_with_arg == 2); } { Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(memoize_tag(cast<uint8_t>(val)))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); val.set(23.0f); Image<uint8_t> out1 = f.realize(256, 256); val.set(23.4f); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 23)); assert(out2(i, j) == (23 + 23)); } } assert(call_count_with_arg == 1); } { // Case with bounds computed not equal to bounds realized. Param<float> val; Param<int32_t> index; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f, g, h; Var x; f(x) = count_calls(x, 0) + cast<uint8_t>(x); g(x) = f(x); h(x) = g(4) + g(index); f.compute_root().memoize(); g.vectorize(x, 8).compute_at(h, x); val.set(23.0f); index.set(2); Image<uint8_t> out1 = h.realize(1); assert(out1(0) == (uint8_t)(2 * 23 + 4 + 2)); assert(call_count_with_arg == 3); index.set(4); out1 = h.realize(1); assert(out1(0) == (uint8_t)(2 * 23 + 4 + 4)); assert(call_count_with_arg == 4); } { // Test Tuple case Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = Tuple(count_calls(x, y) + cast<uint8_t>(x), x); count_calls.compute_root().memoize(); f.compute_root().memoize(); Func g; g(x, y) = Tuple(f(x, y)[0] + f(x - 1, y)[0] + f(x + 1, y)[0], f(x, y)[1]); val.set(23.0f); Realization out = g.realize(128, 128); Image<uint8_t> out0 = out[0]; Image<int32_t> out1 = out[1]; for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { 
assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); assert(out1(i, j) == i); } } out = g.realize(128, 128); out0 = out[0]; out1 = out[1]; for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); assert(out1(i, j) == i); } } assert(call_count_with_arg == 1); } { // Test cache eviction Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = count_calls(x, y) + cast<uint8_t>(x); count_calls.compute_root().memoize(); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } // TODO work out an assertion on call count here. fprintf(stderr, "Call count is %d.\n", call_count_with_arg); // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { // Test flushing entire cache with a single element larger than the cache Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = count_calls(x, y) + cast<uint8_t>(x); count_calls.compute_root().memoize(); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } // TODO work out an assertion on call count here. fprintf(stderr, "Call count before oversize realize is %d.\n", call_count_with_arg); call_count_with_arg = 0; Image<uint8_t> big = g.realize(1024, 1024); Image<uint8_t> big2 = g.realize(1024, 1024); // TODO work out an assertion on call count here. fprintf(stderr, "Call count after oversize realize is %d.\n", call_count_with_arg); call_count_with_arg = 0; for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } fprintf(stderr, "Call count is %d.\n", call_count_with_arg); // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { // Test parallel cache access Param<float> val; Func count_calls; count_calls.define_extern("count_calls_with_arg_parallel", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 3); Func f; Var x, y; // Ensure that all calls map to the same cache key, but pass a thread ID // through to avoid having to do locking or an atomic add f(x, y) = count_calls(x, y % 4, memoize_tag(y / 16, 0)) + cast<uint8_t>(x); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); count_calls.compute_at(f, y).memoize(); f.compute_at(g, y).memoize(); g.parallel(y, 16); val.set(23.0f); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); Image<uint8_t> out = g.realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); } } // TODO work out an assertion on call counts here. for (int i = 0; i < 8; i++) { fprintf(stderr, "Call count for thread %d is %d.\n", i, call_count_with_arg_parallel[i]); } // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { Param<float> val; Func f; Var x, y; f(x, y) = cast<uint8_t>((x << 8) + y); Func prev_func = f; Func stage[4]; for (int i = 0; i < 4; i++) { std::vector<ExternFuncArgument> args(3); args[0] = cast<int32_t>(i); args[1] = cast<int32_t>(val); args[2] = prev_func; stage[i].define_extern("count_calls_staged", args, UInt(8), 2); prev_func = stage[i]; } f.compute_root(); for (int i = 0; i < 3; i++) { stage[i].compute_root(); } stage[3].compute_root().memoize(); val.set(23.0f); Image<uint8_t> result = stage[3].realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23)); } } for (int i = 0; i < 4; i++) { fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]); } result = stage[3].realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23)); } } for (int i = 0; i < 4; i++) { fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]); } } fprintf(stderr, "Success!\n"); return 0; }
int main(int argc, char **argv) { Expr random_bit = cast<uint8_t>(random_float() > 0.5f); // First define the function that gives the initial state of the // game board { Func initial; initial(x, y, c) = random_bit; initial.compile_to_file("game_of_life_init"); } // Then the function that updates the state. Also depends on user input. { ImageParam state(UInt(8), 3); Param<int> mouse_x, mouse_y; // Add a boundary condition. Func clamped; clamped(x, y, c) = state(clamp(x, state.left(), state.right()), clamp(y, state.top(), state.bottom()), c); Expr xm = max(x-1, 0), xp = min(x+1, state.width()-1); Expr ym = max(y-1, 0), yp = min(y+1, state.height()-1); // Count the number of live neighbors. Expr count = (clamped(x - 1, y - 1, c) + clamped(x, y - 1, c) + clamped(x + 1, y - 1, c) + clamped(x - 1, y, c) + clamped(x + 1, y, c) + clamped(x - 1, y + 1, c) + clamped(x, y + 1, c) + clamped(x + 1, y + 1, c)); // Was this pixel alive in the previous generation? Expr alive_before = state(x, y, c) != 0; // We're alive in the next generation if we have two neighbors and // were alive before, or if we have three neighbors. 
Expr alive_now = (count == 2 && alive_before) || count == 3; Expr alive = cast<uint8_t>(1); Expr dead = cast<uint8_t>(0); Func output; output(x, y, c) = select(alive_now, alive, dead); // Clobber part of the output around where the mouse is with random junk Expr min_x = clamp(mouse_x - 10, 0, state.width()-1); Expr max_x = clamp(mouse_x + 10, 0, state.width()-1); Expr min_y = clamp(mouse_y - 10, 0, state.height()-1); Expr max_y = clamp(mouse_y + 10, 0, state.height()-1); RDom clobber(min_x, max_x - min_x + 1, min_y, max_y - min_y + 1); Expr dx = clobber.x - mouse_x; Expr dy = clobber.y - mouse_y; Expr r = dx*dx + dy*dy; output(clobber.x, clobber.y, c) = select(r < 100, cast<uint8_t>(random_float() < 0.25f), output(clobber.x, clobber.y, c)); output.vectorize(x, 16); clamped.compute_at(output, x); Var yi; output.split(y, y, yi, 16).reorder(x, yi, c, y).parallel(y); output.compile_to_file("game_of_life_update", state, mouse_x, mouse_y); } // Now the function that converts the state into an argb image. { ImageParam state(UInt(8), 3); Func state_32; state_32(x, y, c) = cast<int32_t>(state(x, y, c)); Func render; Expr r = select(state_32(x, y, 0) == 1, 255, 0); Expr g = select(state_32(x, y, 1) == 1, 255, 0); Expr b = select(state_32(x, y, 2) == 1, 255, 0); render(x, y) = (255 << 24) + (r << 16) + (g << 8) + b; render.vectorize(x, 4); state_32.compute_at(render, x); Var yi; render.split(y, y, yi, 16).parallel(y); render.compile_to_file("game_of_life_render", state); } return 0; }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, block, thread, 16); // Func::gpu_tile behaves the same as Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing // the intermediate result in shared memory. In the schedule // above xo corresponds to GPU blocks. 
padded.compute_at(curved, xo); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or // Metal are not enabled by default. We have to construct a // Target object, enable one of them, and then pass that // target object to compile_jit. Otherwise your CPU will very // slowly pretend it's a GPU, and use one thread per output // pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or Metal, depending on which platform // we're on. OS X doesn't update its OpenCL drivers, so they // tend to be broken. CUDA would also be a fine choice on // machines with NVidia GPUs. if (target.os == Target::OSX) { target.set_feature(Target::Metal); } else { target.set_feature(Target::OpenCL); } // Uncomment the next line and comment out the lines above to // try CUDA instead. // target.set_feature(Target::CUDA); // If you want to see all of the OpenCL, Metal, or CUDA API // calls done by the pipeline, you can also enable the Debug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. // target.set_feature(Target::Debug); curved.compile_jit(target); }
int main(int argc, char **argv) { // Try doing vector loads with a boundary condition in various // ways and compare the performance. input = Image<uint16_t>(1024+8, 320); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { input(x, y) = rand() & 0xfff; } } output = Image<uint16_t>(1024, 320); Var x, y; double t_ref, t_clamped, t_scalar, t_pad; { // Do an unclamped load to get a reference number Func f; f(x, y) = input(x, y) * 3 + input(x+1, y); f.vectorize(x, 8); t_ref = test(f, false); } { // Variant 1 - do the clamped vector load Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); t_clamped = test(f); } { // Variant 2 - do the load as a scalar op just before the vectorized stuff Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); g.compute_at(f, x); t_scalar = test(f); } { // Variant 3 - pad each scanline using scalar code Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); g.compute_at(f, y); t_pad = test(f); } // This constraint is pretty lax, because the op is so trivial // that the overhead of branching is large. For more complex ops, // the overhead should be smaller. We just make sure it's faster // than scalarizing or padding. if (t_clamped > t_scalar || t_clamped > t_pad) { printf("Clamped load timings suspicious:\n" "Unclamped: %f\n" "Clamped: %f\n" "Scalarize the load: %f\n" "Pad the input: %f\n", t_ref, t_clamped, t_scalar, t_pad); return -1; } printf("Success!\n"); // Clean up our global images, otherwise you get destructor // order weirdness. The images hold onto the JIT-compiled module // that created them, and will delete it when they die. However, // it might not be possible to destroy the module cleanly after // main exits, because destroying the module touches globals // inside of llvm, and destructor order of globals is not // guaranteed. 
input = Image<uint16_t>(); output = Image<uint16_t>(); return 0; }
// Merge sort contiguous chunks of size s in a 1d func. Func merge_sort(Func input, int total_size) { std::vector<Func> stages; Func result; const int parallel_work_size = 512; Func parallel_stage("parallel_stage"); // First gather the input into a 2D array of width four where each row is sorted { assert(input.dimensions() == 1); // Use a small sorting network Expr a0 = input(4*y); Expr a1 = input(4*y+1); Expr a2 = input(4*y+2); Expr a3 = input(4*y+3); Expr b0 = min(a0, a1); Expr b1 = max(a0, a1); Expr b2 = min(a2, a3); Expr b3 = max(a2, a3); a0 = min(b0, b2); a1 = max(b0, b2); a2 = min(b1, b3); a3 = max(b1, b3); b0 = a0; b1 = min(a1, a2); b2 = max(a1, a2); b3 = a3; result(x, y) = select(x == 0, b0, select(x == 1, b1, select(x == 2, b2, b3))); result.compute_at(parallel_stage, y).bound(x, 0, 4).unroll(x); stages.push_back(result); } // Now build up to the total size, merging each pair of rows for (int chunk_size = 4; chunk_size < total_size; chunk_size *= 2) { // "result" contains the sorted halves assert(result.dimensions() == 2); // Merge pairs of rows from the partial result Func merge_rows("merge_rows"); RDom r(0, chunk_size*2); // The first dimension of merge_rows is within the chunk, and the // second dimension is the chunk index. Keeps track of two // pointers we're merging from and an output value. 
merge_rows(x, y) = Tuple(0, 0, cast(input.value().type(), 0)); Expr candidate_a = merge_rows(r-1, y)[0]; Expr candidate_b = merge_rows(r-1, y)[1]; Expr valid_a = candidate_a < chunk_size; Expr valid_b = candidate_b < chunk_size; Expr value_a = result(clamp(candidate_a, 0, chunk_size-1), 2*y); Expr value_b = result(clamp(candidate_b, 0, chunk_size-1), 2*y+1); merge_rows(r, y) = tuple_select(valid_a && ((value_a < value_b) || !valid_b), Tuple(candidate_a + 1, candidate_b, value_a), Tuple(candidate_a, candidate_b + 1, value_b)); if (chunk_size <= parallel_work_size) { merge_rows.compute_at(parallel_stage, y); } else { merge_rows.compute_root(); } if (chunk_size == parallel_work_size) { parallel_stage(x, y) = merge_rows(x, y)[2]; parallel_stage.compute_root().parallel(y); result = parallel_stage; } else { result = lambda(x, y, merge_rows(x, y)[2]); } } // Convert back to 1D return lambda(x, result(x, 0)); }
// Demosaics a deinterleaved Bayer image.
//
// deinterleaved(x, y, k) supplies the four known planes; as read below,
// k = 0 is green on red rows, 1 is red, 2 is blue, 3 is green on blue rows.
// Returns a Func output(x, y, c) with c = 0, 1, 2 selecting r, g, b.
// The schedule applied depends on the file-level `schedule` value.
Func demosaic(Func deinterleaved) {
    // Naming: a_b is the value of channel a at a site whose input channel
    // is b. "gr" means a green site on a red row, "gb" a green site on a
    // blue row.

    // The four planes we already know from the input.
    Func r_r, g_gr, g_gb, b_b;
    g_gr(x, y) = deinterleaved(x, y, 0);
    r_r(x, y)  = deinterleaved(x, y, 1);
    b_b(x, y)  = deinterleaved(x, y, 2);
    g_gb(x, y) = deinterleaved(x, y, 3);

    // The eight planes that must be interpolated.
    Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

    // --- Green at red and blue sites ---
    // Interpolate both vertically and horizontally, measure the absolute
    // difference in each direction, and keep the direction with the
    // smaller difference.
    Expr g_r_vert       = avg(g_gb(x, y-1), g_gb(x, y));
    Expr g_r_vert_diff  = absd(g_gb(x, y-1), g_gb(x, y));
    Expr g_r_horiz      = avg(g_gr(x+1, y), g_gr(x, y));
    Expr g_r_horiz_diff = absd(g_gr(x+1, y), g_gr(x, y));
    g_r(x, y) = select(g_r_horiz_diff < g_r_vert_diff, g_r_horiz, g_r_vert);

    Expr g_b_vert       = avg(g_gr(x, y+1), g_gr(x, y));
    Expr g_b_vert_diff  = absd(g_gr(x, y+1), g_gr(x, y));
    Expr g_b_horiz      = avg(g_gb(x-1, y), g_gb(x, y));
    Expr g_b_horiz_diff = absd(g_gb(x-1, y), g_gb(x, y));
    g_b(x, y) = select(g_b_horiz_diff < g_b_vert_diff, g_b_horiz, g_b_vert);

    // --- Red and blue at green sites ---
    // Interpolate the colour, then add the error green would have had if
    // it had been interpolated the same way (i.e. the second derivative of
    // the green channel at that place).
    Expr r_gr_fix = g_gr(x, y) - avg(g_r(x, y), g_r(x-1, y));
    r_gr(x, y) = r_gr_fix + avg(r_r(x-1, y), r_r(x, y));

    Expr b_gr_fix = g_gr(x, y) - avg(g_b(x, y), g_b(x, y-1));
    b_gr(x, y) = b_gr_fix + avg(b_b(x, y), b_b(x, y-1));

    Expr r_gb_fix = g_gb(x, y) - avg(g_r(x, y), g_r(x, y+1));
    r_gb(x, y) = r_gb_fix + avg(r_r(x, y), r_r(x, y+1));

    Expr b_gb_fix = g_gb(x, y) - avg(g_b(x, y), g_b(x+1, y));
    b_gb(x, y) = b_gb_fix + avg(b_b(x, y), b_b(x+1, y));

    // --- Red at blue and blue at red (diagonal neighbours) ---
    // Combines both tricks above: try the positive and the negative
    // diagonal, correct each with the green second derivative, and keep
    // the diagonal with the lower absolute difference.
    Expr r_b_pos_fix  = g_b(x, y) - avg(g_r(x, y), g_r(x-1, y+1));
    Expr r_b_pos      = r_b_pos_fix + avg(r_r(x, y), r_r(x-1, y+1));
    Expr r_b_pos_diff = absd(r_r(x, y), r_r(x-1, y+1));

    Expr r_b_neg_fix  = g_b(x, y) - avg(g_r(x-1, y), g_r(x, y+1));
    Expr r_b_neg      = r_b_neg_fix + avg(r_r(x-1, y), r_r(x, y+1));
    Expr r_b_neg_diff = absd(r_r(x-1, y), r_r(x, y+1));

    r_b(x, y) = select(r_b_pos_diff < r_b_neg_diff, r_b_pos, r_b_neg);

    // Same thing for blue at red.
    Expr b_r_pos_fix  = g_r(x, y) - avg(g_b(x, y), g_b(x+1, y-1));
    Expr b_r_pos      = b_r_pos_fix + avg(b_b(x, y), b_b(x+1, y-1));
    Expr b_r_pos_diff = absd(b_b(x, y), b_b(x+1, y-1));

    Expr b_r_neg_fix  = g_r(x, y) - avg(g_b(x+1, y), g_b(x, y-1));
    Expr b_r_neg      = b_r_neg_fix + avg(b_b(x+1, y), b_b(x, y-1));
    Expr b_r_neg_diff = absd(b_b(x+1, y), b_b(x, y-1));

    b_r(x, y) = select(b_r_pos_diff < b_r_neg_diff, b_r_pos, b_r_neg);

    // Interleave the per-site planes back into full-resolution channels.
    Func r = interleave_y(interleave_x(r_gr, r_r),
                          interleave_x(r_b, r_gb));
    Func g = interleave_y(interleave_x(g_gr, g_r),
                          interleave_x(g_b, g_gb));
    Func b = interleave_y(interleave_x(b_gr, b_r),
                          interleave_x(b_b, b_gb));

    Func output;
    output(x, y, c) = select(c == 0, r(x, y),
                             c == 1, g(x, y),
                                     b(x, y));

    /* THE SCHEDULE */

    // Handles to the interpolated planes, so each schedule variant can
    // apply the same directive to all of them. A Func copy refers to the
    // same underlying function, so scheduling through the copy schedules
    // the original.
    Func interpolated[] = {g_r, g_b, r_gr, b_gr, r_gb, b_gb, r_b, b_r};
    const int interpolated_count = sizeof(interpolated) / sizeof(interpolated[0]);

    if (schedule == 0) {
        // Optimized for ARM: compute in chunks over tiles, vectorized by 8.
        for (int k = 0; k < interpolated_count; k++) {
            interpolated[k].compute_at(processed, tx).vectorize(x, 8);
        }

        // The output interleaves in y, so unrolling it in y helps.
        output.compute_at(processed, tx)
            .vectorize(x, 8)
            .unroll(y, 2)
            .reorder(c, x, y).bound(c, 0, 3).unroll(c);
    } else if (schedule == 1) {
        // Optimized for X86: don't vectorize, because sse is bad at
        // 16-bit interleaving.
        for (int k = 0; k < interpolated_count; k++) {
            interpolated[k].compute_at(processed, tx);
        }

        // The output interleaves in x and y, so unrolling both helps.
        output.compute_at(processed, tx).unroll(x, 2).unroll(y, 2)
            .reorder(c, x, y).bound(c, 0, 3).unroll(c);
    } else {
        // Basic naive schedule: everything at root.
        for (int k = 0; k < interpolated_count; k++) {
            interpolated[k].compute_root();
        }
        output.compute_root();
    }

    return output;
}
int main(int argc, char **argv) { // Try doing vector loads with a boundary condition in various // ways and compare the performance. input = Image<uint16_t>(1024+8, 320); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { input(x, y) = rand() & 0xfff; } } output = Image<uint16_t>(1024, 320); Var x, y; double t_ref, t_clamped, t_scalar, t_pad; { // Do an unclamped load to get a reference number Func f; f(x, y) = input(x, y) * 3 + input(x+1, y); f.vectorize(x, 8); t_ref = test(f, false); } { // Variant 1 - do the clamped vector load Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); t_clamped = test(f); } { // Variant 2 - do the load as a scalar op just before the vectorized stuff Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); g.compute_at(f, x); t_scalar = test(f); } { // Variant 3 - pad each scanline using scalar code Func g; g(x, y) = input(clamp(x, MIN, MAX), y); Func f; f(x, y) = g(x, y) * 3 + g(x+1, y); f.vectorize(x, 8); g.compute_at(f, y); t_pad = test(f); } // This constraint is pretty lax, because the op is so trivial // that the overhead of branching is large. For more complex ops, // the overhead should be smaller. if (t_clamped > 5.0f * t_ref || t_clamped > t_scalar || t_clamped > t_pad) { printf("Clamped load timings suspicious:\n" "Unclamped: %f\n" "Clamped: %f\n" "Scalarize the load: %f\n" "Pad the input: %f\n", t_ref, t_clamped, t_scalar, t_pad); return -1; } printf("Success!\n"); return 0; }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, 16); // Func::gpu_tile method is similar to Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing the // intermediate result in shared memory. Var::gpu_blocks, and // Var::gpu_threads exist to help you schedule producers within // GPU threads and blocks. 
padded.compute_at(curved, Var::gpu_blocks()); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA or OpenCL are // not enabled by default. We have to construct a Target // object, enable one of them, and then pass that target // object to compile_jit. Otherwise your CPU will very slowly // pretend it's a GPU, and use one thread per output pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or CUDA. // We'll enable OpenCL here, because it tends to give better // performance than CUDA, even with NVidia's drivers, because // NVidia's open source LLVM backend doesn't seem to do all // the same optimizations their proprietary compiler does. target.features |= Target::OpenCL; // Uncomment the next line and comment out the line above to // try CUDA instead. // target.features |= Target::CUDA; // If you want to see all of the OpenCL or CUDA API calls done // by the pipeline, you can also enable the GPUDebug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. //target.features |= Target::GPUDebug; curved.compile_jit(target); }
// Builds a 2D real-to-complex DFT pipeline over the first two dimensions of r.
//
// r:      real-valued input Func. Its first two arguments are the DFT
//         dimensions (n0, n1); any remaining arguments are passed through
//         as outer dimensions. Must have at least two arguments.
// R0, R1: factor lists for dimensions 0 and 1; the DFT sizes are
//         N0 = product(R0) and N1 = product(R1).
//         (Presumably the radices of the FFT passes -- see fft_dim1.)
// target: used to pick vector widths.
// desc:   options read here: name (prefix for Func names), gain,
//         vector_width (zip group size if > 0), parallel, schedule_input.
//
// Returns the DFT as a ComplexFunc bounded to n0 in [0, N0) and
// n1 in [0, (N1 + 1) / 2 + 1); the result is undefined outside those bounds.
//
// Relies on file-level names not visible in this block: `group`, `j`, `A`,
// `fft_dim1`, `tiled_transpose`, `transpose`, `product`, `gcd`, `undef_z`.
ComplexFunc fft2d_r2c(Func r, const vector<int> &R0, const vector<int> &R1, const Target& target, const Fft2dDesc& desc) {
    string prefix = desc.name.empty() ? "r2c_" : desc.name + "_";

    // Split the argument list into the two DFT dimensions and whatever
    // outer dimensions remain.
    vector<Var> args(r.args());
    Var n0(args[0]), n1(args[1]);
    args.erase(args.begin());
    args.erase(args.begin());

    // Get the innermost variable outside the FFT.
    Var outer = Var::outermost();
    if (!args.empty()) {
        outer = args.front();
    }

    int N0 = product(R0);
    int N1 = product(R1);

    // Cache of twiddle factors for this FFT.
    TwiddleFactorSet twiddle_cache;

    // The gain requested of the FFT. Note this is halved further down, when
    // the zipped DFTs are unzipped.
    Expr gain = desc.gain;

    // Combine pairs of real columns x, y into complex columns z = x + j y. This
    // allows us to compute two real DFTs using one complex FFT. See the large
    // comment above this function for more background.
    //
    // An implementation detail is that we zip the columns in groups from the
    // input data to enable the loads to be dense vectors. x is taken from the
    // even indexed groups of columns, y is taken from the odd indexed groups of
    // columns.
    //
    // Changing the group size can (insignificantly) numerically change the result
    // due to regrouping floating point operations. To avoid this, if the FFT
    // description specified a vector width, use it as the group size.
    ComplexFunc zipped(prefix + "zipped");
    int zip_width = desc.vector_width;
    if (zip_width <= 0) {
        zip_width = target.natural_vector_size(r.output_types()[0]);
    }
    // Ensure the zip width divides the zipped extent.
    zip_width = gcd(zip_width, N0 / 2);
    // Map a zipped column index to the input column of its "x" half; the
    // matching "y" half sits zip_width columns further along.
    Expr zip_n0 = (n0 / zip_width) * zip_width * 2 + (n0 % zip_width);
    zipped(A({n0, n1}, args)) = ComplexExpr(r(A({zip_n0, n1}, args)), r(A({zip_n0 + zip_width, n1}, args)));

    // DFT down the columns first.
    ComplexFunc dft1 = fft_dim1(zipped, R1, -1, // sign
                                std::min(zip_width, N0 / 2), // extent of dim 0
                                1.0f, false, // We parallelize unzipped below instead.
                                prefix, target, &twiddle_cache);

    // Unzip the two groups of real DFTs we zipped together above. For more
    // information about the unzipping operation, see the large comment above this
    // function.
    ComplexFunc unzipped(prefix + "unzipped");
    {
        // Inverse of the zip_n0 mapping: output column -> zipped column.
        Expr unzip_n0 = (n0 / (zip_width * 2)) * zip_width + (n0 % zip_width);
        ComplexExpr Z = dft1(A({unzip_n0, n1}, args));
        ComplexExpr conjsymZ = conj(dft1(A({unzip_n0, (N1 - n1) % N1}, args)));
        ComplexExpr X = Z + conjsymZ;
        ComplexExpr Y = -j * (Z - conjsymZ);
        // Rather than divide the above expressions by 2 here, adjust the gain
        // instead.
        gain /= 2;
        // The first zip_width columns of each group of 2 * zip_width take X,
        // the rest take Y.
        unzipped(A({n0, n1}, args)) = select(n0 % (zip_width * 2) < zip_width, X, Y);
    }

    // Zip the DC and Nyquist DFT bin rows, which should be real. Row 0 of
    // zipped_0 holds the DC row in its real part and the Nyquist row (N1 / 2)
    // in its imaginary part; all other rows pass through unchanged.
    ComplexFunc zipped_0(prefix + "zipped_0");
    zipped_0(A({n0, n1}, args)) = select(n1 > 0, likely(unzipped(A({n0, n1}, args))), ComplexExpr(re(unzipped(A({n0, 0}, args))), re(unzipped(A({n0, N1 / 2}, args)))));

    // The vectorization of the columns must not exceed this value.
    int zipped_extent0 = std::min((N1 + 1) / 2, zip_width);

    // Transpose so we can FFT dimension 0 (by making it dimension 1).
    ComplexFunc unzippedT, unzippedT_tiled;
    std::tie(unzippedT, unzippedT_tiled) = tiled_transpose(zipped_0, zipped_extent0, target, prefix);

    // DFT down the columns again (the rows of the original). The (halved)
    // gain is applied in this pass.
    ComplexFunc dftT = fft_dim1(unzippedT, R0, -1, // sign
                                zipped_extent0, gain, desc.parallel, prefix, target, &twiddle_cache);

    // Transpose the result back to the original orientation, unless the caller
    // requested a transposed DFT.
    // NOTE(review): no conditional is visible here -- the transpose below is
    // unconditional. Confirm whether the "unless" clause is stale.
    ComplexFunc dft = transpose(dftT);

    // We are going to add a row to the result (with update steps) by unzipping
    // the DC and Nyquist bin rows. To avoid unnecessarily computing some junk for
    // this row before we overwrite it, pad the pure definition with undef.
    dft = ComplexFunc(constant_exterior((Func)dft, Tuple(undef_z()), Expr(), Expr(), Expr(0), Expr(N1 / 2)));

    // Unzip the DFTs of the DC and Nyquist bin DFTs. Unzip the Nyquist DFT first,
    // because the DC bin DFT is updated in-place. For more information about
    // this, see the large comment above this function.
    //
    // The update numbering below matters: the schedule at the end of this
    // function refers to updates by index.
    RDom n0z1(1, N0 / 2);
    RDom n0z2(N0 / 2, N0 / 2);
    // Update 0: Unzip the DC bin of the DFT of the Nyquist bin row.
    dft(A({0, N1 / 2}, args)) = im(dft(A({0, 0}, args)));
    // Update 1: Unzip the rest of the DFT of the Nyquist bin row.
    dft(A({n0z1, N1 / 2}, args)) = 0.5f * -j * (dft(A({n0z1, 0}, args)) - conj(dft(A({N0 - n0z1, 0}, args))));
    // Update 2: Compute the rest of the Nyquist bin row via conjugate symmetry.
    // Note that this redundantly computes n0 = N0/2, but that's faster and easier
    // than trying to deal with N0/2 - 1 bins.
    dft(A({n0z2, N1 / 2}, args)) = conj(dft(A({N0 - n0z2, N1 / 2}, args)));

    // Update 3: Unzip the DC bin of the DFT of the DC bin row.
    dft(A({0, 0}, args)) = re(dft(A({0, 0}, args)));
    // Update 4: Unzip the rest of the DFT of the DC bin row.
    dft(A({n0z1, 0}, args)) = 0.5f * (dft(A({n0z1, 0}, args)) + conj(dft(A({N0 - n0z1, 0}, args))));
    // Update 5: Compute the rest of the DC bin row via conjugate symmetry.
    // Note that this redundantly computes n0 = N0/2, but that's faster and easier
    // than trying to deal with N0/2 - 1 bins.
    dft(A({n0z2, 0}, args)) = conj(dft(A({N0 - n0z2, 0}, args)));

    // Schedule.
    dftT.compute_at(dft, outer);

    // Schedule the tiled transposes.
    if (unzippedT_tiled.defined()) {
        unzippedT_tiled.compute_at(dftT, group);
    }

    // Schedule the input, if requested.
    if (desc.schedule_input) {
        r.compute_at(dft1, group);
    }

    // Vectorize the zip groups, and unroll by a factor of 2 to simplify the
    // even/odd selection.
    Var n0o("n0o"), n0i("n0i");
    unzipped.compute_at(dft, outer)
        .split(n0, n0o, n0i, zip_width * 2)
        .reorder(n0i, n1, n0o)
        .vectorize(n0i, zip_width)
        .unroll(n0i);
    dft1.compute_at(unzipped, n0o);
    if (desc.parallel) {
        // Note that this also parallelizes dft1, which is computed inside this loop
        // of unzipped.
        unzipped.parallel(n0o);
    }

    // Schedule the final DFT transpose and unzipping updates.
    dft.vectorize(n0, target.natural_vector_size<float>())
        .unroll(n0, std::min(N0 / target.natural_vector_size<float>(), 4));

    // The Nyquist bin at n0z = N0/2 looks like a race condition because it
    // simplifies to an expression similar to the DC bin. However, we include it
    // in the reduction because it makes the reduction have length N/2, which is
    // convenient for vectorization, and just ignore the resulting appearance of
    // a race condition.
    // Updates 0 and 3 write a single bin each and are left unscheduled.
    dft.update(1).allow_race_conditions()
        .vectorize(n0z1, target.natural_vector_size<float>());
    dft.update(2).allow_race_conditions()
        .vectorize(n0z2, target.natural_vector_size<float>());
    dft.update(4).allow_race_conditions()
        .vectorize(n0z1, target.natural_vector_size<float>());
    dft.update(5).allow_race_conditions()
        .vectorize(n0z2, target.natural_vector_size<float>());

    // Our result is undefined outside these bounds.
    dft.bound(n0, 0, N0);
    dft.bound(n1, 0, (N1 + 1) / 2 + 1);

    return dft;
}