// Builds a stage that linearly rescales `image` so that its minimum maps to 0
// and its maximum maps to 255, rounds to the nearest integer, and replicates
// that level into the three low bytes of a uint32 (level * 0x010101).
//
// The image-wide minimum and maximum are reductions over the full
// width() x height() extent; both are scheduled compute_root. A constant
// image (max == min) would otherwise divide by zero, so in that case the
// scale is forced to 0 and every output pixel becomes 0.
Halide::Func ImageConverter(Halide::ImageParam image) {
    // Reduction domain covering the whole input image.
    RDom r(0, image.width(), 0, image.height());

    Func imgmin;
    imgmin() = minimum(image(r.x, r.y));
    Func imgmax;
    imgmax() = maximum(image(r.x, r.y));

    // Reciprocal of the dynamic range, guarded against a flat image.
    Expr range = imgmax() - imgmin();
    Expr scale = select(range == 0, Expr(0.0f), 1.0f / range);

    Func rescaled;
    Var x, y;
    // The +0.5f makes the truncating cast round to nearest.
    Expr val = cast<uint32_t>(255.0f * (image(x, y) - imgmin()) * scale + 0.5f);
    rescaled(x, y) = val * cast<uint32_t>(0x010101);

    // Evaluate the two scalar reductions once, not per output pixel.
    imgmin.compute_root();
    imgmax.compute_root();

    // A tile(32, 8) / vectorize / unroll schedule for `rescaled` was left
    // disabled in the original and is intentionally not applied here.
    return rescaled;
}
// Runs performBlur over `f`, transposes the result, and compensates for the
// attenuation introduced by the zero boundary condition.
//
// The compensation is obtained by blurring an image of ones with the same
// coefficients: whatever comes out is the per-position attenuation, and the
// output is multiplied by its reciprocal. Only column 0 of the attenuation is
// used, since it is the same for every row/channel.
// (TODO carried over from the original: add a generator param to select
// different boundary conditions.)
Func blur_then_transpose(Func f, Func coeff, Expr size, Expr sigma) {
    Func filtered = performBlur(f, coeff, size, sigma);

    // Blur of a constant-one image == attenuation caused by the boundary.
    Func unit;
    unit(x, y) = 1.0f;
    Func falloff = performBlur(unit, coeff, size, sigma);

    // Reciprocal of one attenuation column, so it can be applied by multiply.
    Func gain;
    gain(y) = 1.0f / falloff(0, y);

    // Swap the axes of the blurred image.
    Func flipped;
    flipped(x, y) = filtered(y, x);

    // After the transpose the attenuation varies along x.
    Func out;
    out(x, y) = flipped(x, y) * gain(x);

    // ---- Schedule ----
    Var yi, xi, yii, xii;

    falloff.compute_root();

    gain.compute_root();
    gain.vectorize(y, 8);

    out.compute_root();
    out.tile(x, y, xi, yi, 8, 32);
    out.tile(xi, yi, xii, yii, 8, 8);
    out.vectorize(xii);
    out.unroll(yii);
    out.parallel(y);

    filtered.compute_at(out, y);

    flipped.compute_at(out, xi);
    flipped.vectorize(y);
    flipped.unroll(x);

    // Every update stage of the blur is vectorized 8-wide across x; stages
    // that have a reduction domain additionally get x moved innermost.
    const int update_count = filtered.num_update_definitions();
    for (int stage = 0; stage < update_count; stage++) {
        RDom r = filtered.reduction_domain(stage);
        if (r.defined()) {
            filtered.update(stage).reorder(x, r);
        }
        filtered.update(stage).vectorize(x, 8).unroll(x);
    }

    return out;
}
// Applies a 3x4 colour matrix to `input`, where the matrix is a blend of two
// calibrated matrices (3200K and 7000K) interpolated linearly in inverse
// kelvin. The blended matrix is stored as int16 Q8.8 fixed point and computed
// at root; the per-pixel arithmetic is done in int32 and narrowed to int16
// after dividing by 256.
Func color_correct(Func input, ImageParam matrix_3200, ImageParam matrix_7000, Param<float> kelvin) {
    Func matrix;
    // Interpolation weight: 1 at 3200K, 0 at 7000K, linear in 1/kelvin.
    Expr alpha = (1.0f/kelvin - 1.0f/3200) / (1.0f/7000 - 1.0f/3200);
    Expr blended = (matrix_3200(x, y) * alpha + matrix_7000(x, y) * (1 - alpha));
    matrix(x, y) = cast<int16_t>(blended * 256.0f); // Q8.8 fixed point
    matrix.compute_root();

    Func corrected;
    Expr ir = cast<int32_t>(input(x, y, 0));
    Expr ig = cast<int32_t>(input(x, y, 1));
    Expr ib = cast<int32_t>(input(x, y, 2));

    // One output channel: offset (column 3) plus the weighted input channels,
    // scaled back down from Q8.8 and narrowed.
    auto channel = [&](int row) -> Expr {
        Expr acc = matrix(3, row) + matrix(0, row) * ir + matrix(1, row) * ig + matrix(2, row) * ib;
        return cast<int16_t>(acc/256);
    };

    Expr r = channel(0);
    Expr g = channel(1);
    Expr b = channel(2);

    corrected(x, y, c) = select(c == 0, r,
                                c == 1, g,
                                        b);
    return corrected;
}
// Now we define methods that give our pipeline several different // schedules. void schedule_for_cpu() { // Compute the look-up-table ahead of time. lut.compute_root(); // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Look-up-tables don't vectorize well, so just parallelize // curved in slices of 16 scanlines. Var yo, yi; curved.split(y, yo, yi, 16) .parallel(yo); // Compute sharpen as needed per scanline of curved. sharpen.compute_at(curved, yi); // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide. sharpen.vectorize(x, 8); // Compute the padded input as needed per scanline of curved, // reusing previous values computed within the same strip of // 16 scanlines. padded.store_at(curved, yo) .compute_at(curved, yi); // Also vectorize the padding. It's 8-bit, so we'll vectorize // 16-wide. padded.vectorize(x, 16); // JIT-compile the pipeline for the CPU. curved.compile_jit(); }
Func build(bool use_shared) { Func host; Var x, y; host(x, y) = x + y; host.compute_root(); // We'll either inline this (and hopefully use the GPU's L1 cache) // or stage it into shared. Func staged; staged(x, y) = host(x, y); // Now we just need to access the Func staged a bunch. const int stages = 10; Func f[stages]; for (int i = 0; i < stages; i++) { Expr prev = (i == 0) ? Expr(0) : Expr(f[i-1](x, y)); Expr stencil = 0; for (int dy = -1; dy <= 1; dy++) { for (int dx = -1; dx <= 1; dx++) { stencil += staged(select(prev > 0, x, x+dx), select(prev > 0, y, y+dy)); } } if (i == 0) { f[i](x, y) = stencil; } else { f[i](x, y) = f[i-1](x, y) + stencil; } } Func final = f[stages-1]; final.compute_root().gpu_tile(x, y, 8, 8);
int main(int argc, char **argv) { Func data; Var x; data(x) = sin(x); data.compute_root(); Func sorted; std::vector<ExternFuncArgument> args; args.push_back(data); sorted.define_extern("sort_buffer", args, Float(32), 1); Buffer<float> output = sorted.realize(100); // Check the output Buffer<float> reference = lambda(x, sin(x)).realize(100); std::sort(&reference(0), &reference(100)); RDom r(reference); float error = evaluate_may_gpu<float>(sum(abs(reference(r) - output(r)))); if (error != 0) { printf("Output incorrect\n"); return -1; } printf("Success!\n"); return 0; }
// Computes a rotated basis at order (iXo, iYo, iTo, iCo) for the given angle.
//
// A weight is accumulated for every possible split of the total spatial order
// (iXo + iYo), and the result is the weighted sum of the matching components
// of `stBasis`. The returned Func is scheduled compute_root.
Func ColorMgetfilter(Func stBasis, float angle, uint8_t iXo, uint8_t iYo, uint8_t iTo, uint8_t iCo ) {
    // temporary setting
    uint8_t numSTB = 63;
    uint8_t numSB = 21;

    angle = -1*angle - M_PI/2;

    // work: rotated basis at a particular spatio-temporal order
    Func work;
    work(x,y,t) = cast<float>(0.0f);

    // One zero-initialised weight per possible order, indices 0 .. iXo+iYo.
    const int order = iXo + iYo;
    float *weights = (float *) calloc(order + 1, sizeof(float));

    // compute weights for possible orders
    for (int i = 0; i <= iXo; i++) {
        for (int j = 0; j <= iYo; j++) {
            weights[order - i - j] += float(combination(iXo,i))*float(combination(iYo,j))*pow((-1.0f),float(i))*pow(cos(angle),float(iXo-i+j))*pow(sin(angle),float(iYo+i-j));
        }
    }

    // Accumulate the filtered expression at this order and angle, skipping
    // terms with a non-positive filter index or a zero weight.
    for (int k = 0; k <= order; k++) {
        const int index = Mgetfilterindex(order - k, k, iTo, numSTB, numSB);
        if (index <= 0 || weights[order - k] == 0) {
            continue;
        }
        work(x,y,t) += weights[order - k]*stBasis(x,y,iCo,t)[index];
    }

    work.compute_root();
    free(weights);
    return work;
}
// Assembles the raw-processing chain
//   hot_pixel_suppression -> deinterleave -> demosaic -> color_correct -> apply_curve
// writes it into `processed`, and schedules it in parallel strips of rows.
// When the target has an HVX feature the vector width is overridden and the
// output stage is offloaded with hexagon().
Func process(Func raw, Type result_type, ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp, Param<float> gamma, Param<float> contrast, Param<int> blackLevel, Param<int> whiteLevel) {
    Var yii, xi;

    // ---- Algorithm ----
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast, blackLevel, whiteLevel);

    processed(x, y, c) = curved(x, y, c);

    // ---- Schedule ----
    Expr out_width = processed.output_buffer().width();
    Expr out_height = processed.output_buffer().height();

    int rows_per_strip = 32;

    // Lanes for 16-bit data; HVX targets use a fixed width instead.
    int lanes = target.natural_vector_size(UInt(16));
    if (target.has_feature(Target::HVX_64)) {
        lanes = 32;
    } else if (target.has_feature(Target::HVX_128)) {
        lanes = 64;
    }
    const int wide = 2*lanes;

    denoised.compute_at(processed, yi);
    denoised.store_at(processed, yo);
    denoised.fold_storage(y, 8);
    denoised.vectorize(x, lanes);

    deinterleaved.compute_at(processed, yi);
    deinterleaved.store_at(processed, yo);
    deinterleaved.fold_storage(y, 4);
    deinterleaved.vectorize(x, wide, TailStrategy::RoundUp);
    deinterleaved.reorder(c, x, y);
    deinterleaved.unroll(c);

    corrected.compute_at(processed, x);
    corrected.vectorize(x, lanes);
    corrected.reorder(c, x, y);
    corrected.unroll(c);

    // Output: strips of rows_per_strip rows run in parallel, rows are
    // handled in pairs, and x is vectorized in chunks of `wide`.
    processed.compute_root();
    processed.split(y, yo, yi, rows_per_strip);
    processed.split(yi, yi, yii, 2);
    processed.split(x, x, xi, wide, TailStrategy::RoundUp);
    processed.reorder(xi, c, yii, x, yi, yo);
    processed.vectorize(xi, wide);
    processed.parallel(yo);

    if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        processed.hexagon();
        denoised.align_storage(x, lanes);
        deinterleaved.align_storage(x, lanes);
        corrected.align_storage(x, lanes);
    }

    // We can generate slightly better code if we know the splits divide the
    // extent, so the output bounds are rounded down to whole multiples.
    processed.bound(c, 0, 3);
    processed.bound(x, 0, (out_width/wide)*wide);
    processed.bound(y, 0, (out_height/rows_per_strip)*rows_per_strip);

    return processed;
}
int simple_rfactor_with_specialize_test(bool compile_module) { Func f("f"), g("g"); Var x("x"), y("y"); f(x, y) = x + y; f.compute_root(); g(x, y) = 40; RDom r(10, 20, 30, 40); g(r.x, r.y) = min(f(r.x, r.y) + 2, g(r.x, r.y)); Param<int> p; Var u("u"); Func intm = g.update(0).specialize(p >= 10).rfactor(r.y, u); intm.compute_root(); intm.vectorize(u, 8); intm.update(0).vectorize(r.x, 2); if (compile_module) { p.set(20); // Check the call graphs. Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls checker; m.functions().front().body.accept(&checker); CallGraphs expected = { {g.name(), {}}, {g.update(0).name(), {f.name(), intm.name(), g.name()}}, {intm.name(), {}}, {intm.update(0).name(), {f.name(), intm.name()}}, {f.name(), {}}, }; if (check_call_graphs(checker.calls, expected) != 0) { return -1; } } else { { p.set(0); Image<int> im = g.realize(80, 80); auto func = [](int x, int y, int z) { return (10 <= x && x <= 29) && (30 <= y && y <= 69) ? std::min(x + y + 2, 40) : 40; }; if (check_image(im, func)) { return -1; } } { p.set(20); Image<int> im = g.realize(80, 80); auto func = [](int x, int y, int z) { return (10 <= x && x <= 29) && (30 <= y && y <= 69) ? std::min(x + y + 2, 40) : 40; }; if (check_image(im, func)) { return -1; } } } return 0; }
// Lowers `f` (scheduled compute_root) for the JIT target with bounds queries
// disabled, walks the resulting statement with a CountHostAlignmentAsserts
// visitor constructed from `m`, and returns the visitor's count.
int count_host_alignment_asserts(Func f, std::map<string, int> m) {
    Target jit_target = get_jit_target_from_environment();
    jit_target.set_feature(Target::NoBoundsQuery);

    f.compute_root();
    Stmt lowered = Internal::lower({f.function()}, f.name(), jit_target);

    CountHostAlignmentAsserts counter(m);
    lowered.accept(&counter);
    return counter.count;
}
// Returns the identity ramp in(x) = x, computed at root, passed through
// upsample() twice.
Func build() {
    Func in;
    in(x) = x;
    in.compute_root();

    Func up = upsample(upsample(in));
    return up;
}
// Lowers `f` (scheduled compute_root) for the JIT target with bounds queries
// and asserts disabled, walks the resulting statement with a
// CountInterleaves visitor, and returns the visitor's result.
int count_interleaves(Func f) {
    Target jit_target = get_jit_target_from_environment();
    jit_target.set_feature(Target::NoBoundsQuery);
    jit_target.set_feature(Target::NoAsserts);

    f.compute_root();
    Stmt lowered = Internal::lower({f.function()}, f.name(), jit_target);

    CountInterleaves counter;
    lowered.accept(&counter);
    return counter.result;
}
int main(int argc, char **argv) { // Define a pipeline that dumps some squares to a file using an // external consumer stage. Func source; Var x; source(x) = x*x; Param<int> min, extent; Param<const char *> filename; Func sink; std::vector<ExternFuncArgument> args; args.push_back(source); args.push_back(filename); args.push_back(min); args.push_back(extent); sink.define_extern("dump_to_file", args, Int(32), 0); source.compute_root(); sink.compile_jit(); // Dump the first 10 squares to a file filename.set("halide_test_extern_consumer.txt"); min.set(0); extent.set(10); sink.realize(); if (!check_result()) return -1; // Test ImageParam ExternFuncArgument via passed in image. Image<int32_t> buf = source.realize(10); ImageParam passed_in(Int(32), 1); passed_in.set(buf); Func sink2; std::vector<ExternFuncArgument> args2; args2.push_back(passed_in); args2.push_back(filename); args2.push_back(min); args2.push_back(extent); sink2.define_extern("dump_to_file", args2, Int(32), 0); sink2.realize(); if (!check_result()) return -1; printf("Success!\n"); return 0; }
// Assembles the raw-processing chain
//   hot_pixel_suppression -> deinterleave -> demosaic -> color_correct -> apply_curve
// into `processed` and applies one of three schedules chosen by `schedule`:
//   0       tiled 32x32, intermediates vectorized, parallel over tile rows
//   1       tiled 128x128, no vectorization, parallel over tile rows
//   other   every stage compute_root
Func process(Func raw, Type result_type, ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp, Param<float> gamma, Param<float> contrast) {
    Var xi, yi;

    // ---- Algorithm ----
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast);

    processed(tx, ty, c) = curved(tx, ty, c);

    // ---- Schedule ----
    // The colour loop always runs over exactly 0..2.
    processed.bound(c, 0, 3);

    if (schedule == 0) {
        // Compute in chunks over tiles, vectorized by 8.
        denoised.compute_at(processed, tx);
        denoised.vectorize(x, 8);

        deinterleaved.compute_at(processed, tx);
        deinterleaved.vectorize(x, 8);
        deinterleaved.reorder(c, x, y);
        deinterleaved.unroll(c);

        corrected.compute_at(processed, tx);
        corrected.vectorize(x, 4);
        corrected.reorder(c, x, y);
        corrected.unroll(c);

        processed.tile(tx, ty, xi, yi, 32, 32);
        processed.reorder(xi, yi, c, tx, ty);
        processed.parallel(ty);
    } else if (schedule == 1) {
        // Same structure with larger tiles and no vectorization (the
        // original notes that sse is bad at interleaved 16-bit ops).
        denoised.compute_at(processed, tx);
        deinterleaved.compute_at(processed, tx);
        corrected.compute_at(processed, tx);

        processed.tile(tx, ty, xi, yi, 128, 128);
        processed.reorder(xi, yi, c, tx, ty);
        processed.parallel(ty);
    } else {
        // Fallback: every stage is computed in full at root.
        denoised.compute_root();
        deinterleaved.compute_root();
        corrected.compute_root();
        processed.compute_root();
    }

    return processed;
}
int main(int argc, char **argv) { // Generate random input image. const int W = 128, H = 48; Buffer<uint8_t> in(W, H); for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) { in(x, y) = rand() & 0xff; } } Var x("x"), y("y"); // Apply the boundary condition up-front. Func input = BoundaryConditions::repeat_edge(in); input.compute_root(); // Define the dilate algorithm. Func max_x("max_x"); Func dilate3x3("dilate3x3"); max_x(x, y) = max3(input(x-1, y), input(x, y), input(x+1, y)); dilate3x3(x, y) = max3(max_x(x, y-1), max_x(x, y), max_x(x, y+1)); // Schedule. Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { dilate3x3.gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { dilate3x3.hexagon().vectorize(x, 64); } else { dilate3x3.vectorize(x, target.natural_vector_size<uint8_t>()); } // Run the pipeline and verify the results are correct. Buffer<uint8_t> out = dilate3x3.realize(W, H, target); for (int y = 1; y < H-1; y++) { for (int x = 1; x < W-1; x++) { uint16_t correct = max3(max3(in(x-1, y-1), in(x, y-1), in(x+1, y-1)), max3(in(x-1, y ), in(x, y ), in(x+1, y )), max3(in(x-1, y+1), in(x, y+1), in(x+1, y+1))); if (out(x, y) != correct) { std::cout << "out(" << x << ", " << y << ") = " << out(x, y) << " instead of " << correct << "\n"; return -1; } } } std::cout << "Success!\n"; return 0; }
int rdom_with_predicate_rfactor_test(bool compile_module) { Func f("f"), g("g"); Var x("x"), y("y"), z("z"); f(x, y, z) = x + y + z; f.compute_root(); g(x, y, z) = 1; RDom r(5, 10, 5, 10, 0, 20); r.where(r.x < r.y); r.where(r.x + 2*r.y <= r.z); g(r.x, r.y, r.z) += f(r.x, r.y, r.z); Var u("u"), v("v"); Func intm = g.update(0).rfactor({{r.y, u}, {r.x, v}}); intm.compute_root(); Var ui("ui"), vi("vi"), t("t"); intm.tile(u, v, ui, vi, 2, 2).fuse(u, v, t).parallel(t); intm.update(0).vectorize(r.z, 2); if (compile_module) { // Check the call graphs. Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls checker; m.functions().front().body.accept(&checker); CallGraphs expected = { {g.name(), {}}, {g.update(0).name(), {intm.name(), g.name()}}, {intm.name(), {}}, {intm.update(0).name(), {f.name(), intm.name()}}, {f.name(), {}}, }; if (check_call_graphs(checker.calls, expected) != 0) { return -1; } } else { Image<int> im = g.realize(20, 20, 20); auto func = [](int x, int y, int z) { return (5 <= x && x <= 14) && (5 <= y && y <= 14) && (0 <= z && z <= 19) && (x < y) && (x + 2*y <= z) ? x + y + z + 1 : 1; }; if (check_image(im, func)) { return -1; } } return 0; }
int main(int argc, char **argv) { Var x, y, z; RDom r(0, 4096, 0, 4096, 0, 256); Func big; big(x, y, z) = cast<uint8_t>(42); big.set_error_handler(&halide_error); big.compute_root(); Func grand_total; grand_total() = cast<uint8_t>(sum(big(r.x, r.y, r.z))); grand_total.set_error_handler(&halide_error); Image<uint8_t> result = grand_total.realize(); assert(error_occurred); printf("Success!\n"); }
// Recursively builds a noise Func with `depth` levels of refinement.
//
// Depth 0 is plain random_float(). Each further level samples the previous
// level at half resolution at the four neighbouring positions
// (x/2, (x+1)/2) x (y/2, (y+1)/2), adds 0.25 * random_float() and divides by
// 4.25. Every level is scheduled compute_root.
Func make_noise(int depth) {
    Func f;
    Var x, y, c;
    if (depth == 0) {
        f(x, y, c) = random_float();
    } else {
        Func g = make_noise(depth - 1);
        f(x, y, c) = (g(x/2, y/2, c) +
                      g((x+1)/2, y/2, c) +
                      g(x/2, (y+1)/2, c) +
                      g((x+1)/2, (y+1)/2, c) +
                      0.25f * random_float()) / 4.25f;
    }
    f.compute_root();
    return f;
}
int main(int argc, char **argv) { ImageParam input(Float(32), 2); Var x, y; Func g; g(x, y) = input(x, y) * 2; g.compute_root(); Func f; f(x, y) = g(x, y); f.parallel(y); f.trace_stores(); f.compile_to_file("user_context_insanity", input, user_context_param()); return 0; }
// Defines a func to blur the columns of an input with a first order low // pass IIR filter, followed by a transpose. Func blur_cols_transpose(Func input, Expr height, Expr alpha) { Func blur; // Pure definition: do nothing. blur(x, y, c) = undef<float>(); // Update 0: set the top row of the result to the input. blur(x, 0, c) = input(x, 0, c); // Update 1: run the IIR filter down the columns. RDom ry(1, height - 1); blur(x, ry, c) = (1 - alpha)*blur(x, ry - 1, c) + alpha*input(x, ry, c); // Update 2: run the IIR blur up the columns. Expr flip_ry = height - ry - 1; blur(x, flip_ry, c) = (1 - alpha)*blur(x, flip_ry + 1, c) + alpha*blur(x, flip_ry, c); // Transpose the blur. Func transpose; transpose(x, y, c) = blur(y, x, c); // Schedule: // Split the transpose into tiles of rows. Parallelize over channels // and strips (Halide supports nested parallelism). Var xo, yo; transpose.compute_root() .tile(x, y, xo, yo, x, y, 8, 8) .vectorize(x) .parallel(yo) .parallel(c); // Run the filter on each row of tiles (which corresponds to a strip of // columns in the input). blur.compute_at(transpose, yo); // Vectorize computations within the strips. blur.update(1) .reorder(x, ry) .vectorize(x); blur.update(2) .reorder(x, ry) .vectorize(x); return transpose; }
int main(int argc, char **argv) { Func f; Var x; f(x) = sin(x); f.compute_root(); const int N = 9; std::vector<Expr> exprs; for (int i = 0; i < N; i++) { exprs.push_back(f(i)); } exprs = bitonic_sort(exprs); std::cout << exprs.size() << "\n"; // Use update definitions to write them to another Func in sorted // order for inspection. Note that doing this doesn't explicitly // share work between each element - it'll generate the huge // min/max expression to extract each sorted element. llvm should // lift out common subexpressions though. Func g; g(x) = undef<float>(); for (int i = 0; i < N; i++) { g(i) = exprs[i]; } Buffer<float> result = g.realize(N); for (int i = 0; i < N; i++) { printf("%f ", result(i)); } printf("\n"); for (int i = 0; i < N-1; i++) { if (result(i) >= result(i+1)) { printf("Results were not in order\n"); return -1; } } return 0; }
//Convolution Func ifft2_c2r(Func input, int W, int H) { Target target = get_target_from_environment(); Fft2dDesc fwd_desc; Fft2dDesc inv_desc; inv_desc.gain = 1.0f/(W*H); //Make complex ComplexFunc input_complex; input_complex(x, y, c) = {input(x, y, c, 0), input(x, y, c, 1)}; // Compute the inverse DFT Func res = fft2d_c2r(input_complex, W, H, target, inv_desc); //Schedule res.compute_root(); return res; }
// Blurs `input` by running blur_then_transpose twice, first with `height`
// and then with `width` as the size argument, sharing one set of IIR
// coefficients that is computed at root.
Func blur(Func input, Expr sigma, Expr width, Expr height) {
    // IIR coefficients, using the method of Young and Van Vliet.
    Func coeff;
    Expr q = select(sigma < 2.5f,
                    3.97156f - 4.14554f*sqrt(1 - 0.26891f*sigma),
                    0.98711f*sigma - 0.96330f);
    Expr denom = 1.57825f + 2.44413f*q + 1.4281f*q*q + 0.422205f*q*q*q;

    coeff(x) = undef<float>();
    coeff(1) = (2.44413f*q + 2.85619f*q*q + 1.26661f*q*q*q)/denom;
    coeff(2) = -(1.4281f*q*q + 1.26661f*q*q*q)/denom;
    coeff(3) = (0.422205f*q*q*q)/denom;
    // coeff(0) is derived from the other three, so it is defined last.
    coeff(0) = 1 - (coeff(1) + coeff(2) + coeff(3));
    coeff.compute_root();

    // Two blur-and-transpose passes; the second consumes the first.
    Func first_pass, second_pass;
    first_pass = blur_then_transpose(input, coeff, height, sigma);
    second_pass = blur_then_transpose(first_pass, coeff, width, sigma);
    return second_pass;
}
int main(int argc, char **argv) { // Move this test to correctness once we can support >4d buffer_ts on the gpu if (!get_jit_target_from_environment().has_gpu_feature()) { printf("No gpu target enabled. Skipping test.\n"); // This test is currently expected to error out. printf("Error: pretending that there was an error\n"); return -1; } Func f; Var v0, v1, v2, v3, v4; f(v0, v1, v2, v3, v4) = v0 + 2*v1 + 4*v2 + 8*v3 + 16*v4; f.compute_root().gpu_blocks(v3, v4).gpu_threads(v1, v2); // Linearize into an output buffer Func g; g(v0) = f(v0 % 2, (v0 / 2) % 2, (v0 / 4) % 2, (v0 / 8) % 2, (v0 / 16) % 2); Image<int> result = g.realize(32); // Delete this code once this test works. printf("Error: I should not have successfully compiled.\n"); return -1; for (int i = 0; i < result.width(); i++) { if (i != result(i)) { printf("result(%d) = %d instead of %d\n", i, result(i), i); return -1; } } printf("Success!\n"); return 0; }
/* Builds n unrolled iterations of the game of life on a torus.
 *
 * For n == 1 the previous generation is the input itself; otherwise it is
 * the recursively built (n-1)-iteration pipeline, computed at root.
 * Neighbour coordinates wrap around modulo the input width and height.
 */
Func gameOfLife(ImageParam input, int n) {
    Var x, y;

    Func prev;
    if (n == 1) {
        prev(x, y) = input(x, y);
    } else {
        prev = gameOfLife(input, n-1);
        prev.compute_root();
    }

    Expr w = input.width(), h = input.height();

    // Wrapped neighbour coordinates.
    Expr west = (x+w-1) % w, east = (x+1) % w, north = (y+h-1) % h, south = (y+1) % h;

    Expr livingNeighbors = (prev(west, north) + prev(x, north) +
                            prev(east, north) + prev(west, y) +
                            prev(east, y) + prev(west, south) +
                            prev(x, south) + prev(east, south));
    Expr alive = prev(x, y) != 0;

    // A cell lives with exactly three living neighbours, or with two if it
    // is already alive.
    Func output;
    output(x, y) = select(livingNeighbors == 3 || (alive && livingNeighbors == 2), u8(1), u8(0));

    return output;
}
int main(int argc, char **argv) { Func f; Var x, y; Func in; in(x, y) = x + y; in.compute_root(); // Set f to zero f(x, y) = 0; // Then iterate over a circle, adding in(x, y) to f. Expr t = cast<int>(ceil(sqrt(10*10 - y*y))); f(x, y) += select(x > -t && x < t, in(x, y), 0); in.trace_loads(); f.set_custom_trace(my_trace); f.realize(20, 20); int c = 0; for (int y = 0; y < 20; y++) { for (int x = 0; x < 20; x++) { if (x*x + y*y < 10*10) c++; } } if (count != c) { printf("Func 'in' should only have been loaded from at points " "within the circle x*x + y*y < 10*10. It was loaded %d " "times, but there are %d points within that circle\n", count, c); printf("Passing for now. TODO: re-enable this test once trim-no-ops is in.\n"); } printf("Success!\n"); return 0; }
int subtraction_rfactor_test() { Func f("f"), g("g"), ref("ref"); Var x("x"), y("y"); f(x, y) = x + y; f.compute_root(); Param<int> inner_extent, outer_extent; RDom r(10, inner_extent, 30, outer_extent); inner_extent.set(20); outer_extent.set(40); ref(x, y) = 40; ref(x, y) -= f(r.x, r.y); g(x, y) = 40; g(x, y) -= f(r.x, r.y); RVar rxi("rxi"), rxo("rxo"); g.update(0).split(r.x, rxo, rxi, 2); Var u("u"); Func intm = g.update(0).rfactor(rxo, u); intm.compute_root(); intm.update(0).vectorize(u, 2); Image<int> im_ref = ref.realize(80, 80); Image<int> im = g.realize(80, 80); auto func = [&im_ref](int x, int y, int z) { return im_ref(x, y); }; if (check_image(im, func)) { return -1; } return 0; }
int main(int argc, char **argv) { Image<uint8_t> board1(32, 32), board2(32, 32), board3(32, 32); for (int y = 0; y < 32; y++) { for (int x = 0; x < 32; x++) { uint8_t val = ((rand() & 0xff) < 128) ? 1 : 0; board1(x, y) = val; board2(x, y) = val; board3(x, y) = val; } } ImageParam input(UInt(8), 2); { // Outer loop in C Func oneIteration = gameOfLife(input, 1); Func twoIterations = gameOfLife(input, 2); for (int i = 0; i < 10; i++) { input.set(board1); board1 = oneIteration.realize(32, 32); input.set(board1); board1 = oneIteration.realize(32, 32); input.set(board2); board2 = twoIterations.realize(32, 32); /* for (int y = 0; y < 32; y++) { for (int x = 0; x < 32; x++) { printf(board1(x, y) ? "#" : " "); } printf("|"); for (int x = 0; x < 32; x++) { printf(board2(x, y) ? "#" : " "); } printf("\n"); } */ for (int y = 0; y < 32; y++) { for (int x = 0; x < 32; x++) { if (board1(x, y) != board2(x, y)) { printf("At timestep %d, boards one and two disagree at %d, %d: %d vs %d\n", i, x, y, board1(x, y), board2(x, y)); return -1; } } } } } { // Outer loop in Halide using a reduction Func life; // Initialize step Var x, y, z; life(x, y, z) = input(x, y); // Update step Expr w = input.width(), h = input.height(); RDom t(0, w, 0, h, 0, 21); Expr lastT = (t.z+1)%2; Expr W = (t.x+w-1) % w, E = (t.x+1) % w, N = (t.y+h-1) % h, S = (t.y+1) % h; Expr alive = life(t.x, t.y, lastT) != u8(0); Expr livingNeighbors = (life(W, N, lastT) + life(t.x, N, lastT) + life(E, N, lastT) + life(W, t.y, lastT) + life(E, t.y, lastT) + life(W, S, lastT) + life(t.x, S, lastT) + life(E, S, lastT)); life(t.x, t.y, t.z%2) = select(livingNeighbors == 3 || (alive && livingNeighbors == 2), u8(1), u8(0)); life.compute_root(); Func output; output(x, y) = life(x, y, 1); input.set(board3); output.realize(board3); /* for (int y = 0; y < 32; y++) { for (int x = 0; x < 32; x++) { printf(board1(x, y) ? "#" : " "); } printf("|"); for (int x = 0; x < 32; x++) { printf(board3(x, y) ? 
"#" : " "); } printf("\n"); } */ for (int y = 0; y < 32; y++) { for (int x = 0; x < 32; x++) { if (board1(x, y) != board3(x, y)) { printf("Boards one and three disagree at %d, %d: %d vs %d\n", x, y, board1(x, y), board3(x, y)); return -1; } } } } printf("Success!\n"); return 0; }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, 16); // Func::gpu_tile method is similar to Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing the // intermediate result in shared memory. Var::gpu_blocks, and // Var::gpu_threads exist to help you schedule producers within // GPU threads and blocks. 
padded.compute_at(curved, Var::gpu_blocks()); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA or OpenCL are // not enabled by default. We have to construct a Target // object, enable one of them, and then pass that target // object to compile_jit. Otherwise your CPU will very slowly // pretend it's a GPU, and use one thread per output pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or CUDA. // We'll enable OpenCL here, because it tends to give better // performance than CUDA, even with NVidia's drivers, because // NVidia's open source LLVM backend doesn't seem to do all // the same optimizations their proprietary compiler does. target.features |= Target::OpenCL; // Uncomment the next line and comment out the line above to // try CUDA instead. // target.features |= Target::CUDA; // If you want to see all of the OpenCL or CUDA API calls done // by the pipeline, you can also enable the GPUDebug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. //target.features |= Target::GPUDebug; curved.compile_jit(target); }
int main(int argc, char **argv) { /* THE ALGORITHM */ // Number of pyramid levels int J = 8; // number of intensity levels Param<int> levels; // Parameters controlling the filter Param<float> alpha, beta; // Takes a 16-bit input ImageParam input(UInt(16), 3); // loop variables Var c, k; // Make the remapping function as a lookup table. Func remap; Expr fx = cast<float>(x) / 256.0f; remap(x) = alpha*fx*exp(-fx*fx/2.0f); // Convert to floating point Func floating; floating(x, y, c) = cast<float>(input(x, y, c)) / 65535.0f; // Set a boundary condition Func clamped; clamped(x, y, c) = floating(clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c); // Get the luminance channel Func gray; gray(x, y) = 0.299f * clamped(x, y, 0) + 0.587f * clamped(x, y, 1) + 0.114f * clamped(x, y, 2); // Make the processed Gaussian pyramid. Func gPyramid[J]; // Do a lookup into a lut with 256 entires per intensity level Expr idx = gray(x, y)*cast<float>(levels-1)*256.0f; idx = clamp(cast<int>(idx), 0, (levels-1)*256); gPyramid[0](x, y, k) = beta*gray(x, y) + remap(idx - 256*k); for (int j = 1; j < J; j++) { gPyramid[j](x, y, k) = downsample(gPyramid[j-1])(x, y, k); } // Get its laplacian pyramid Func lPyramid[J]; lPyramid[J-1] = gPyramid[J-1]; for (int j = J-2; j >= 0; j--) { lPyramid[j](x, y, k) = gPyramid[j](x, y, k) - upsample(gPyramid[j+1])(x, y, k); } // Make the Gaussian pyramid of the input Func inGPyramid[J]; inGPyramid[0] = gray; for (int j = 1; j < J; j++) { inGPyramid[j](x, y) = downsample(inGPyramid[j-1])(x, y); } // Make the laplacian pyramid of the output Func outLPyramid[J]; for (int j = 0; j < J; j++) { // Split input pyramid value into integer and floating parts Expr level = inGPyramid[j](x, y) * cast<float>(levels-1); Expr li = clamp(cast<int>(level), 0, levels-2); Expr lf = level - cast<float>(li); // Linearly interpolate between the nearest processed pyramid levels outLPyramid[j](x, y) = (1.0f - lf) * lPyramid[j](x, y, li) + lf * lPyramid[j](x, y, li+1); } // 
Make the Gaussian pyramid of the output Func outGPyramid[J]; outGPyramid[J-1] = outLPyramid[J-1]; for (int j = J-2; j >= 0; j--) { outGPyramid[j](x, y) = upsample(outGPyramid[j+1])(x, y) + outLPyramid[j](x, y); } // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input) Func color; float eps = 0.01f; color(x, y, c) = outGPyramid[0](x, y) * (clamped(x, y, c)+eps) / (gray(x, y)+eps); Func output("local_laplacian"); // Convert back to 16-bit output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); /* THE SCHEDULE */ remap.compute_root(); Var yi; output.split(y, y, yi, 4).parallel(y).vectorize(x, 4); for (int j = 0; j < 4; j++) { inGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); if (j > 0) gPyramid[j].compute_root().parallel(k).vectorize(x, 4); outGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } for (int j = 4; j < J; j++) { inGPyramid[j].compute_root().parallel(y); gPyramid[j].compute_root().parallel(k); outGPyramid[j].compute_root().parallel(y); } output.compile_to_file("local_laplacian", levels, alpha, beta, input); return 0; }