void copy_float(std::string suffix, ImageParam input8, const int channels) { Var x, y, c; Func input; input(x, y, c) = input8(clamp(x, input8.left(), input8.right()), clamp(y, input8.top(), input8.bottom()), c); Func result("result"); result(x, y, c) = input(x, y, c); result.bound(c, 0, channels); // Unset default constraints so that specialization works. result.output_buffer().set_stride(0, Expr()); Expr interleaved = (result.output_buffer().stride(0) == channels && result.output_buffer().stride(2) == 1); if (suffix == "_rs") { result.shader(x, y, c, DeviceAPI::Renderscript); result.specialize(interleaved).vectorize(c); } else { result.reorder(c, x, y) .parallel(y) .unroll(c) .vectorize(x, 4) .specialize(interleaved); } // non-specialized version is planar std::vector<Argument> args; args.push_back(input8); std::string fn_name = "generated_copy" + suffix + "_float"; result.compile_to_file(fn_name, args, fn_name); }
void blur_uint8(std::string suffix, ImageParam input8, const int channels) { Var x, y, c; Func input; input(x, y, c) = input8(clamp(x, input8.left(), input8.right()), clamp(y, input8.top(), input8.bottom()), c); Func blur_x("blur_x"); blur_x(x, y, c) = cast<uint8_t>( (cast<uint16_t>(input(x, y, c)) + input(x + 1, y, c) + input(x + 2, y, c)) / 3); Func result("result"); result(x, y, c) = cast<uint8_t>( (cast<uint16_t>(blur_x(x, y, c)) + blur_x(x, y + 1, c) + blur_x(x, y + 2, c)) / 3); // Unset default constraints so that specialization works. result.output_buffer().set_stride(0, Expr()); result.bound(c, 0, channels); Expr interleaved = (result.output_buffer().stride(0) == channels && result.output_buffer().stride(2) == 1); Expr planar = result.output_buffer().stride(0) == 1; if (suffix == "_rs") { result.shader(x, y, c, DeviceAPI::Renderscript); result.specialize(interleaved).vectorize(c); // non-specialized version is planar } else { Var yi; result .reorder(c, x, y) .unroll(c) .split(y, y, yi, 32) .parallel(y) .vectorize(x, 8); result.specialize(interleaved); result.specialize(planar); // blur_x is compute at result, so it's included in result's // specializations. blur_x.store_at(result, y) .compute_at(result, yi) .reorder(c, x, y) .unroll(c) .vectorize(x, 8); } std::vector<Argument> args; args.push_back(input8); std::string fn_name = "generated_blur" + suffix + "_uint8"; result.compile_to_file(fn_name, args, fn_name); }
// Constructs a reduction domain with one reduction variable per dimension
// of `p`. Each variable takes its min and extent from p.dim(i) and is named
// "<p.name()>$<suffix>".
//
// The suffix is "x", "y", "z", "w" for the first four dimensions. Any
// further dimension is named "v<index>" instead of indexing past the end of
// the four-entry name table.
RDom::RDom(ImageParam p) {
    static const char *const var_names[] = {"x", "y", "z", "w"};
    const int num_named = static_cast<int>(sizeof(var_names) / sizeof(var_names[0]));

    const int dims = p.dimensions();
    std::vector<ReductionVariable> vars;
    for (int i = 0; i < dims; i++) {
        // Only the first four dimensions have a conventional letter.
        const string suffix = (i < num_named) ? string(var_names[i])
                                              : ("v" + std::to_string(i));
        ReductionVariable var = {
            p.name() + "$" + suffix,
            p.dim(i).min(),
            p.dim(i).extent()
        };
        vars.push_back(var);
    }

    dom = ReductionDomain(vars);
    init_vars(p.name());
}
// Wraps `input` in a Func, passes it with the input's width and height to
// K_grad_mat, and returns the resulting Func with the x-stride constraints
// on the input and output removed.
Func build() {
    // Input: wrap the image parameter in a Func.
    Func input_func("in");
    input_func(x, y, c) = input(x, y, c);

    // Apply K_grad_mat over the full image size.
    Expr width = input.width();
    Expr height = input.height();
    Func K_input = K_grad_mat(input_func, width, height);

    // Allow for arbitrary strides: an undefined Expr removes the
    // constraint on dimension 0.
    input.set_stride(0, Expr());
    K_input.output_buffer().set_stride(0, Expr());

    return K_input;
}
// Applies blur_cols_transpose twice: first with the input's height, then
// with its width on the result of the first call.
Func build() override {
    // The input is an ImageParam, but blur_cols_transpose takes a Func,
    // so wrap it in a trivial Func first.
    Func input_func;
    input_func(x, y, c) = input(x, y, c);

    // First pass: blur the columns of the input.
    Expr height = input.height();
    Func transposed_once = blur_cols_transpose(input_func, height, alpha);

    // Second pass: blur the columns again (the rows of the original).
    Expr width = input.width();
    Func blurred = blur_cols_transpose(transposed_once, width, alpha);

    // Scheduling is done inside blur_cols_transpose.
    return blurred;
}
// Wraps `input` and the kernel `K` in Funcs, passes both with their widths
// and heights to A_conv, and returns the resulting Func with the x-stride
// constraints on both inputs and the output removed.
Func build() {
    // Input image wrapped in a Func.
    Func input_func("in");
    input_func(x, y, c) = input(x, y, c);

    // Kernel wrapped in a Func.
    Func K_func("K");
    K_func(i, j, c) = K(i, j, c);

    // Sizes of the image and of the kernel.
    Expr width = input.width();
    Expr height = input.height();
    Expr width_kernel = K.width();
    Expr height_kernel = K.height();

    // Apply A_conv to the image with the kernel.
    Func conv_input = A_conv(input_func, width, height,
                             K_func, width_kernel, height_kernel);

    // Allow for arbitrary strides: an undefined Expr removes the
    // constraint on dimension 0.
    input.set_stride(0, Expr());
    K.set_stride(0, Expr());
    conv_input.output_buffer().set_stride(0, Expr());

    return conv_input;
}
/* Do n unrolled iterations of game of life on a torus.
 *
 * input - the starting grid; coordinates wrap around its width and height.
 * n     - number of generations to chain together. Values below 1 are
 *         treated as 1; the previous `n == 1` base case was never reached
 *         for n < 1, so the recursion did not terminate.
 *
 * Returns a Func producing u8(1) for live cells and u8(0) for dead cells.
 * Each earlier generation is scheduled compute_root.
 */
Func gameOfLife(ImageParam input, int n) {
    Var x, y;
    Func in;
    if (n <= 1) {
        // Base case: the first generation reads straight from the input.
        in(x, y) = input(x, y);
    } else {
        // Later generations read from the previous one.
        in = gameOfLife(input, n - 1);
        in.compute_root();
    }

    // Wrap-around neighbour coordinates.
    Expr w = input.width(), h = input.height();
    Expr W = (x + w - 1) % w, E = (x + 1) % w, N = (y + h - 1) % h, S = (y + 1) % h;

    Expr livingNeighbors = (in(W, N) + in(x, N) +
                            in(E, N) + in(W, y) +
                            in(E, y) + in(W, S) +
                            in(x, S) + in(E, S));
    Expr alive = in(x, y) != 0;

    // A cell is live with exactly three live neighbours, or if it is
    // already alive and has exactly two.
    Func output;
    output(x, y) = select(livingNeighbors == 3 || (alive && livingNeighbors == 2), u8(1), u8(0));
    return output;
}
// Wraps `input` and `H` in Funcs, passes them with the input size and
// H.channels() to A_warpHomography, and returns the resulting Func with
// the x-stride constraints on both inputs and the output removed.
Func build() {
    // Input image wrapped in a Func.
    Func input_func("in");
    input_func(x, y, c) = input(x, y, c);

    // H wrapped in a Func.
    Func H_func("H");
    H_func(i, j, g) = H(i, j, g);

    // Image size and the channel count of H.
    Expr width = input.width();
    Expr height = input.height();
    Expr nhom = H.channels();

    // Warping
    Func warp_input = A_warpHomography(input_func, width, height, H_func, nhom);

    // Allow for arbitrary strides: an undefined Expr removes the
    // constraint on dimension 0.
    input.set_stride(0, Expr());
    H.set_stride(0, Expr());
    warp_input.output_buffer().set_stride(0, Expr());

    return warp_input;
}
// Wraps the 4-D `input` in a Func, passes it with WTARGET and HTARGET to
// ifft2_c2r, and returns the resulting Func with the x-stride constraints
// on the input and output removed.
Func build() {
    // Input wrapped in a Func.
    Func input_func("in");
    input_func(x, y, c, k) = input(x, y, c, k);

    // Transform via ifft2_c2r at the target size.
    Func fftOut = ifft2_c2r(input_func, WTARGET, HTARGET);

    // Allow for arbitrary strides: an undefined Expr removes the
    // constraint on dimension 0.
    input.set_stride(0, Expr());
    fftOut.output_buffer().set_stride(0, Expr());

    return fftOut;
}
void blur(std::string suffix, ImageParam input) { input.dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); Var x("x"), y("y"), c("c"); Func clamped("clamped"); clamped = BoundaryConditions::repeat_edge(input); Func blur_x("blur_x"); blur_x(x, y, c) = (clamped(x - 1, y, c) + clamped(x, y, c) + clamped(x + 1, y, c)) / 3; Func result("avg_filter"); result(x, y, c) = (blur_x(x, y - 1, c) + blur_x(x, y, c) + blur_x(x, y + 1, c)) / 3; result.output_buffer().dim(2).set_bounds(0, 4).set_stride(1).dim(0).set_stride(4); Target target = get_target_from_environment(); result.bound(c, 0, 4) .reorder_storage(c, x, y) .reorder(c, x, y); if (target.has_gpu_feature() || target.has_feature(Target::OpenGLCompute)) { Var xi("xi"), yi("yi"); result.unroll(c) .gpu_tile(x, y, xi, yi, 64, 64); } else { Var yi("yi"); result .unroll(c) .split(y, y, yi, 32) .parallel(y) .vectorize(x, 4); blur_x.store_at(result, y) .compute_at(result, yi) .reorder(c, x, y) .unroll(c) .vectorize(x, 4); } std::string fn_name = std::string("avg_filter") + suffix; result.compile_to_file(fn_name, {input}, fn_name); }
// Constructs a reduction domain over up to four dimensions of `p`. Each
// dimension that `p` has starts at 0 and spans p.extent(i); the remaining
// slots keep default-constructed Exprs. Variables are named
// "<p.name()>.x$r", ".y$r", ".z$r" and ".w$r".
RDom::RDom(ImageParam p) {
    string names[] = {p.name() + ".x$r",
                      p.name() + ".y$r",
                      p.name() + ".z$r",
                      p.name() + ".w$r"};

    // Fill in min/extent only for the dimensions the image has.
    Expr mins[4], extents[4];
    for (int d = 0; d < 4; d++) {
        if (d < p.dimensions()) {
            mins[d] = 0;
            extents[d] = p.extent(d);
        }
    }

    dom = build_domain(names[0], mins[0], extents[0],
                       names[1], mins[1], extents[1],
                       names[2], mins[2], extents[2],
                       names[3], mins[3], extents[3]);

    // Bind the x/y/z/w members for the dimensions that exist.
    RVar *members[] = {&x, &y, &z, &w};
    for (int d = 0; d < 4; d++) {
        if (d < p.dimensions()) {
            *(members[d]) = RVar(names[d], mins[d], extents[d], dom);
        }
    }
}
// Forwards to ImageParam::set, passing buffer `b` to image parameter `a`.
void set(ImageParam &a, const Buffer &b) {
    a.set(b);
}
// Sets the host alignment of `i` to `align` and records it in `m` under
// the key "<i.name()>.host".
//
// The map entry is added with std::map::insert, so an entry that already
// exists for that key is left unchanged.
void set_alignment_host_ptr(ImageParam &i, int align, std::map<string, int> &m) {
    i.set_host_alignment(align);

    const string key = i.name() + ".host";
    m.insert(std::pair<string, int>(key, align));
}
// Builds the "brighter" pipeline (input plus offset) and then constrains
// or specializes it for the memory layout chosen by the 'layout'
// generator param.
Func build() {
    // Define the Func.
    Func brighter("brighter");
    brighter(x, y, c) = input(x, y, c) + offset;

    // Schedule it.
    brighter.vectorize(x, 16);

    // The pipeline is compiled to handle memory layouts in several
    // different ways, depending on the 'layout' generator param.
    if (layout == Layout::Planar) {
        // Nothing to do: as written, the pipeline only works with
        // images in which each scanline is a densely-packed single
        // color channel. In terms of strides, Halide assumes and
        // asserts that the stride in x is one.
        //
        // That permits planar images, where the red, green, and blue
        // channels are laid out in memory like this:
        //
        // RRRRRRRR
        // RRRRRRRR
        // RRRRRRRR
        // RRRRRRRR
        // GGGGGGGG
        // GGGGGGGG
        // GGGGGGGG
        // GGGGGGGG
        // BBBBBBBB
        // BBBBBBBB
        // BBBBBBBB
        // BBBBBBBB
        //
        // It also permits the less common line-by-line layout, in
        // which scanlines of red, green, and blue alternate:
        //
        // RRRRRRRR
        // GGGGGGGG
        // BBBBBBBB
        // RRRRRRRR
        // GGGGGGGG
        // BBBBBBBB
        // RRRRRRRR
        // GGGGGGGG
        // BBBBBBBB
        // RRRRRRRR
        // GGGGGGGG
        // BBBBBBBB
    } else if (layout == Layout::Interleaved) {
        // In the 'interleaved' format the red, green, and blue values
        // of each pixel sit next to each other in memory:
        //
        // RGBRGBRGBRGBRGBRGBRGBRGB
        // RGBRGBRGBRGBRGBRGBRGBRGB
        // RGBRGBRGBRGBRGBRGBRGBRGB
        // RGBRGBRGBRGBRGBRGBRGBRGB
        //
        // Here the stride in x is three, the stride in y is three
        // times the image width, and the stride in c is one. Tell
        // Halide to assume (and assert) this for input and output.
        input.set_stride(0, 3);  // stride in dimension 0 (x) is three
        input.set_stride(2, 1);  // stride in dimension 2 (c) is one

        brighter.output_buffer().set_stride(0, 3);
        brighter.output_buffer().set_stride(2, 1);

        // An interleaved layout may deserve a different schedule.
        // Additionally assume and assert that there are exactly three
        // color channels, then exploit that to make the loop over 'c'
        // innermost and unrolled.
        input.set_bounds(2, 0, 3);  // Dimension 2 (c) starts at 0 and has extent 3.
        brighter.output_buffer().set_bounds(2, 0, 3);

        // Move the loop over color channels innermost and unroll it.
        brighter.reorder(c, x, y);
        brighter.unroll(c);

        // For an image with an alpha channel (RGBA), the stride in x
        // and the bounds of the channels dimension would both be four
        // instead of three.
    } else if (layout == Layout::Either) {
        // Remove the stride constraints so the compiled pipeline works
        // with any memory layout. It will probably be slow, because
        // all vector loads become gathers and all vector stores become
        // scatters.
        //
        // A default-constructed, undefined Expr means "no constraint".
        input.set_stride(0, Expr());
        brighter.output_buffer().set_stride(0, Expr());
    } else if (layout == Layout::Specialized) {
        // Accept any memory layout with good performance by having
        // Halide inspect the layout at runtime and branch to different
        // code depending on the strides it finds. First relax the
        // default constraint that stride(0) == 1 (an undefined Expr
        // means "no constraint"):
        input.set_stride(0, Expr());
        brighter.output_buffer().set_stride(0, Expr());

        // Then construct boolean Exprs that detect at runtime whether
        // the buffers are planar or interleaved. Each condition should
        // check every fact we want to exploit in that case.
        Expr input_is_planar =
            (input.stride(0) == 1);
        Expr input_is_interleaved =
            (input.stride(0) == 3 &&
             input.stride(2) == 1 &&
             input.extent(2) == 3);

        Expr output_is_planar =
            (brighter.output_buffer().stride(0) == 1);
        Expr output_is_interleaved =
            (brighter.output_buffer().stride(0) == 3 &&
             brighter.output_buffer().stride(2) == 1 &&
             brighter.output_buffer().extent(2) == 3);

        Expr both_planar = input_is_planar && output_is_planar;
        Expr both_interleaved = input_is_interleaved && output_is_interleaved;

        // Func::specialize writes a schedule that switches at runtime
        // to specialized code based on a boolean Expr. That code can
        // exploit the fact that the Expr is known to be true.
        brighter.specialize(both_planar);

        // brighter was already vectorized above, and both
        // specializations inherit that directive. Further directives
        // can be applied to a single specialization only: the
        // interleaved version is additionally reordered and unrolled.
        brighter.specialize(both_interleaved)
            .reorder(c, x, y)
            .unroll(c);

        // Specializations for interleaved input with planar output,
        // and vice versa, could be added too, but two are enough to
        // demonstrate the feature.

        // Specializations can improve performance substantially for
        // the cases they cover, but they also increase the amount of
        // code to compile and ship. If binary size is a concern and
        // the input and output layouts are known, prefer the fixed
        // stride and bounds constraints used in the branches above.
    }

    return brighter;
}