int main(int argc, char **argv) { { Param<bool> param; Func f; Var x; f(x) = select(param, x*3, x*17); // Vectorize when the output is large enough Expr cond = (f.output_buffer().width() >= 4); f.specialize(cond).vectorize(x, 4); // This has created a specialization of f that is // vectorized. Now we want to further specialize both the // default case and the special case based on param. We can // retrieve a reference to the specialization using the same // condition again: f.specialize(cond).specialize(param); // Now specialize the narrow case on param as well f.specialize(param); f.set_custom_trace(&my_trace); f.trace_stores(); Image<int> out(100); // Just check that all the specialization didn't change the output. param.set(true); reset_trace(); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*3; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } param.set(false); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*17; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); return -1; } // Now try a smaller input out = Image<int>(3); param.set(true); reset_trace(); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*3; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } param.set(false); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*17; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); return -1; } } { Func f1, f2, g1, g2; Var x; // Define pipeline A f1(x) = x + 7; g1(x) = f1(x) + f1(x + 1); // Define pipeline B f2(x) = x * 34; g2(x) = f2(x) + f2(x - 1); // Switch between them based on a boolean param Param<bool> param; Func out; out(x) = select(param, g1(x), g2(x)); // These will be outside the condition that specializes out, // but skip stages will nuke their allocation and computation // for us. f1.compute_root(); g1.compute_root(); f2.compute_root(); out.specialize(param); // Count allocations. out.set_custom_allocator(&my_malloc, &my_free); reset_alloc_counts(); param.set(true); out.realize(100); if (empty_allocs != 1 || nonempty_allocs != 2 || frees != 3) { printf("There were supposed to be 1 empty alloc, 2 nonempty allocs, and 3 frees.\n" "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n", empty_allocs, nonempty_allocs, frees); return -1; } reset_alloc_counts(); param.set(false); out.realize(100); if (empty_allocs != 2 || nonempty_allocs != 1 || frees != 3) { printf("There were supposed to be 2 empty allocs, 1 nonempty alloc, and 3 frees.\n" "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n", empty_allocs, nonempty_allocs, frees); return -1; } } { // Specialize for interleaved vs planar inputs ImageParam im(Float(32), 1); im.set_stride(0, Expr()); // unconstrain the stride Func f; Var x; f(x) = im(x); // If we have a stride of 1 it's worth vectorizing, but only if the width is also > 8. f.specialize(im.stride(0) == 1 && im.width() >= 8).vectorize(x, 8); f.trace_stores(); f.set_custom_trace(&my_trace); // Check bounds inference is still cool with widths < 8 f.infer_input_bounds(5); int m = im.get().min(0), e = im.get().extent(0); if (m != 0 || e != 5) { printf("min, extent = %d, %d instead of 0, 5\n", m, e); return -1; } // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize(5); if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return -1; } // Check we don't crash with a larger input, and that it uses vector stores Image<float> image(100); im.set(image); reset_trace(); f.realize(100); if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return -1; } } { // Bounds required of the input change depending on the param ImageParam im(Float(32), 1); Param<bool> param; Func f; Var x; f(x) = select(param, im(x + 10), im(x - 10)); f.specialize(param); param.set(true); f.infer_input_bounds(100); int m = im.get().min(0); if (m != 10) { printf("min %d instead of 10\n", m); return -1; } param.set(false); im.set(Buffer()); f.infer_input_bounds(100); m = im.get().min(0); if (m != -10) { printf("min %d instead of -10\n", m); return -1; } } { // Specialize an update definition Func f; Var x; Param<int> start, size; RDom r(start, size); f(x) = x; f(r) = 10 - r; // Special-case for when we only update one element of f f.update().specialize(size == 1); // Also special-case updating no elements of f f.update().specialize(size == 0); start.set(0); size.set(1); // Not crashing is enough f.realize(100); } { // What happens to bounds inference if an input is not used at // all for a given specialization? ImageParam im(Float(32), 1); Param<bool> param; Func f; Var x; f(x) = select(param, im(x), 0.0f); f.specialize(param); param.set(false); Image<float> image(10); im.set(image); // The image is too small, but that should be OK, because the // param is false so the image will never be used. f.realize(100); } { // Specialization inherits the scheduling directives done so far: ImageParam im(Int(32), 2); Func f; Var x, y; f(x, y) = im(x, y); Expr cond = f.output_buffer().width() >= 4; // Unroll y by two innermost. f.reorder(y, x).unroll(y, 2).reorder(x, y); // Vectorize if the output is at least 4-wide. Inherits the // unrolling already done. f.specialize(cond).vectorize(x, 4); // Confirm that the unrolling applies to both cases using bounds inference: f.infer_input_bounds(3, 1); if (im.get().extent(0) != 3) { printf("extent(0) was supposed to be 3.\n"); return -1; } if (im.get().extent(1) != 2) { // Height is 2, because the unrolling also happens in the // specialized case. printf("extent(1) was supposed to be 2.\n"); return -1; } } { // Check we don't need to specialize intermediate stages. ImageParam im(Int(32), 1); Func f, g, h, out; Var x; f(x) = im(x); g(x) = f(x); h(x) = g(x); out(x) = h(x); Expr w = out.output_buffer().extent(0); out.output_buffer().set_min(0, 0); f.compute_root().specialize(w >= 4).vectorize(x, 4); g.compute_root().vectorize(x, 4); h.compute_root().vectorize(x, 4); out.specialize(w >= 4).vectorize(x, 4); Image<int> input(3), output(3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); } { // Check specializations of stages nested in other stages simplify appropriately. ImageParam im(Int(32), 2); Param<bool> cond1, cond2; Func f, out; Var x, y; f(x, y) = im(x, y); out(x, y) = f(x, y); f.compute_at(out, x).specialize(cond1 && cond2).vectorize(x, 4); out.compute_root().specialize(cond1 && cond2).vectorize(x, 4); if_then_else_count = 0; CountIfThenElse pass1; for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) { pass1.mutate(ff.body); } Image<int> input(3, 3), output(3, 3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); if (if_then_else_count != 1) { printf("Expected 1 IfThenElse stmts. Found %d.\n", if_then_else_count); return -1; } } { // Check specializations of stages nested in other stages simplify appropriately. ImageParam im(Int(32), 2); Param<bool> cond1, cond2; Func f, out; Var x, y; f(x, y) = im(x, y); out(x, y) = f(x, y); f.compute_at(out, x).specialize(cond1).vectorize(x, 4); out.compute_root().specialize(cond1 && cond2).vectorize(x, 4); if_then_else_count = 0; CountIfThenElse pass2; for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) { pass2.mutate(ff.body); } Image<int> input(3, 3), output(3, 3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); // There should have been 2 Ifs total: They are the // outer cond1 && cond2, and the condition in the true case // should have been simplified away. The If in the false // branch cannot be simplified. if (if_then_else_count != 2) { printf("Expected 2 IfThenElse stmts. Found %d.\n", if_then_else_count); return -1; } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Var x, y; { // Define a reduction with two update steps Func f; f(x) = sin(x); RDom r1(1, 10); Expr xl = r1; // left to right pass Expr xr = 10 - r1; // right to left pass f(xl) = f(xl - 1) + f(xl); f(xr) = f(xr + 1) + f(xr); Image<float> result = f.realize(11); // The same thing in C float ref[11]; for (int i = 0; i < 11; i++) { ref[i] = sinf(i); } for (int i = 1; i < 11; i++) { ref[i] += ref[i-1]; } for (int i = 9; i >= 0; i--) { ref[i] += ref[i+1]; } for (int i = 0; i < 11; i++) { if (fabs(result(i) - ref[i]) > 0.0001f) { printf("result(%d) = %f instead of %f\n", i, result(i), ref[i]); return -1; } } } { // Define a reduction that fills an array, integrates it, then // manually change certain values. One of the values will // depend on another function. Func f, g; g(x) = x*x; f(x) = x; // Integrate from 1 to 10 RDom r(1, 10); f(r) = f(r) + f(r-1); // Clobber two values f(17) = 8; f(109) = 4; // Clobber a range using another func RDom r2(4, 5); f(r2) = g(r2); g.compute_at(f, r2); Image<int> result = f.realize(110); int correct[110]; for (int i = 0; i < 110; i++) { correct[i] = i; } for (int i = 1; i < 11; i++) { correct[i] += correct[i-1]; } correct[17] = 8; correct[109] = 4; for (int i = 4; i < 9; i++) { correct[i] = i*i; } for (int i = 0; i < 110; i++) { if (correct[i] != result(i)) { printf("result(%d) = %d instead of %d\n", i, result(i), correct[i]); return -1; } } } { // Create a fully unrolled fibonacci routine composed almost // entirely of single assignment statements. The horror! Func f; f(x) = 1; for (int i = 2; i < 20; i++) { f(i) = f(i-1) + f(i-2); } Image<int> result = f.realize(20); int ref[20]; ref[0] = 1; ref[1] = 1; for (int i = 2; i < 20; i++) { ref[i] = ref[i-1] + ref[i-2]; if (ref[i] != result(i)) { printf("fibonacci(%d) = %d instead of %d\n", i, result(i), ref[i]); return -1; } } } { // Make an integral image Func f; f(x, y) = sin(x + y); RDom r(1, 99); f(x, r) += f(x, r - 1); f(r, y) += f(r - 1, y); // Walk down the image in vectors f.update(0).vectorize(x, 4); // Walk across the image in parallel. We need to do an unsafe // reorder operation here to move y to the outer loop, because // we don't have the ability to reorder vars with rvars yet. f.update(1).reorder(Var(r.x.name()), y).parallel(y); Image<float> result = f.realize(100, 100); // Now the equivalent in C (cheating and using Halide for the initial image) Image<float> ref = lambda(x, y, sin(x+y)).realize(100, 100); for (int y = 1; y < 100; y++) { for (int x = 0; x < 100; x++) { ref(x, y) += ref(x, y - 1); } } for (int y = 0; y < 100; y++) { for (int x = 1; x < 100; x++) { ref(x, y) += ref(x - 1, y); } } // Check they're the same for (int y = 0; y < 100; y++) { for (int x = 0; x < 100; x++) { if (fabs(ref(x, y) - result(x, y)) > 0.0001f) { printf("integral image at (%d, %d) = %f instead of %f\n", x, y, result(x, y), ref(x, y)); return -1; } } } } { // Walk down an image using a few different factors of splits Func f; RDom r(1, 99); Var xo, xi; ImageParam input(Float(32), 2); f(x, y) = input(x, y); f(x, r) += f(x, r-1) + input(x, r); f(x, r) += f(x, r-1) + input(x, r); f(x, r) += f(x, r-1) + input(x, r); f(x, r) += f(x, r-1) + input(x, r); f.update(0).split(x, x, xi, 11); f.update(1).split(x, x, xi, 13); f.update(2).split(x, x, xi, 17); // So if we ask for an output of size 100x10, we'll need an // input of size 110 x 100. 110 is enough to cover rounding up // 100 to be a multiple of 11, 13, and 17. f.infer_input_bounds(100, 10); Image<float> in = input.get(); if (in.width() != 110 || in.height() != 100) { printf("Unexpected image size: %d x %d instead of 144 x 100\n", in.width(), in.height()); return -1; } } printf("Success!\n"); return 0; }