int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (!target.has_feature(Target::OpenCL)) {
        printf("This test requires opencl.\n");
        return 0;
    }

    // These calls are only available for AOT-compiled code:
    //
    // halide_set_custom_get_symbol(my_get_symbol_impl);
    // halide_set_custom_load_library(my_load_library_impl);
    // halide_set_custom_get_library_symbol(my_get_library_symbol_impl);
    //
    // For JIT code, we must use JITSharedRuntime::set_default_handlers().
    Internal::JITHandlers handlers;
    handlers.custom_get_symbol = my_get_symbol_impl;
    handlers.custom_load_library = my_load_library_impl;
    handlers.custom_get_library_symbol = my_get_library_symbol_impl;
    Internal::JITSharedRuntime::set_default_handlers(handlers);

    Var x, y, xi, yi;
    Func f;
    f(x, y) = cast<int32_t>(x + y);
    f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenCL);
    f.set_error_handler(my_error_handler);

    Buffer<int32_t> out = f.realize(64, 64, target);

    fprintf(stderr, "Should not get here.\n");
    return -1;
}
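// A plausible sketch of the handlers registered above (not necessarily the
// test's actual definitions): each one logs the request and then forwards to
// the default runtime implementation, so the pipeline still works while the
// test observes that the JIT routes lookups through the custom hooks.
// halide_get_symbol, halide_load_library, and halide_get_library_symbol are
// the runtime defaults declared in HalideRuntime.h.
void *my_get_symbol_impl(const char *name) {
    printf("get_symbol: %s\n", name);
    return halide_get_symbol(name);
}

void *my_load_library_impl(const char *name) {
    printf("load_library: %s\n", name);
    return halide_load_library(name);
}

void *my_get_library_symbol_impl(void *lib, const char *name) {
    printf("get_library_symbol: %s\n", name);
    return halide_get_library_symbol(lib, name);
}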
int main(int argc, char **argv) {
    // Make sure that freeing GPU buffers doesn't occur before the
    // computation that is filling them completes.
    Func f;
    Var x, y;
    RDom r(0, 100);
    f(x, y) = sum(sqrt(sqrt(sqrt(sqrt(x + y + r)))));

    Target t = get_jit_target_from_environment();
    if (t.has_feature(Target::OpenCL) || t.has_feature(Target::CUDA)) {
        f.gpu_tile(x, y, 16, 16);

        // This allocates a buffer, does gpu compute into it, and then
        // frees it (calling dev_free) possibly before the compute is
        // done.
        for (int i = 0; i < 10; i++) {
            f.realize(1024, 1024, t);
        }
    } else {
        // Skip this test if gpu target not enabled (it's pretty slow on a cpu).
    }

    printf("Success!\n");
    return 0;
}
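// A minimal sketch of the synchronization point this test relies on (an
// assumed usage, not part of the test): when the host actually needs the
// data, copying the buffer back blocks until the device work that fills it
// has finished -- the same trick the GPU benchmark later in this section uses.
Buffer<float> result = f.realize(1024, 1024, t);
result.copy_to_host();  // blocks until the kernel filling 'result' completes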
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, 16);

    // The Func::gpu_tile method is similar to Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing the
    // intermediate result in shared memory. Var::gpu_blocks and
    // Var::gpu_threads exist to help you schedule producers within
    // GPU threads and blocks.
    padded.compute_at(curved, Var::gpu_blocks());

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.features |= Target::OpenCL;

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.features |= Target::CUDA;

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the GPUDebug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.features |= Target::GPUDebug;

    curved.compile_jit(target);
}
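// A short usage sketch for the schedule above (the output buffer name is an
// assumption, not part of the lesson): the first realize() pays the one-time
// GPU initialization and the copy of the input to the device; later runs
// reuse the device-resident input and mostly just launch the kernels.
schedule_for_gpu();
Image<uint8_t> result(input.width(), input.height(), input.channels());
curved.realize(result);  // slow first run: GPU setup + host-to-device copies
curved.realize(result);  // subsequent runs measure mostly kernel time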
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.
        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());
        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.
        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());
        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));
        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();

    if (1) {
        // Test a tuple reduction on the gpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        // Updates to a reduction are atomic.
        f(x, y) = Tuple(f(x, y)[1]*2, f(x, y)[0]*2);
        // now equals ((x - y)*2, (x + y)*2)

        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
            f.update().gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
            f.update().hexagon(y).vectorize(x, 32);
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y)*2;
                int correct_b = (x + y)*2;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Now test one that alternates between cpu and gpu per update step
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the pure step and the odd update steps on the gpu
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
        }
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            } else {
                f.update(i);
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Same as above, but switches which steps are gpu and cpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the even update steps on the gpu
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                f.update(i);
            } else {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // In this one, each step only uses one of the tuple elements
        // of the previous step, so only that buffer should get copied
        // back to host or copied to device.
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y - 1000, x - y + 1000);
        for (size_t i = 0; i < 10; i++) {
            f(x, y) = Tuple(f(x, y)[1] - 1, f(x, y)[1] + 1);
        }

        // Schedule the even update steps on the gpu
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                f.update(i);
            } else {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y + 1000) + 8;
                int correct_b = (x - y + 1000) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("No gpu target enabled. Skipping test.\n");
        return 0;
    }

    Var x, y, z, w;

    Image<int> full(80, 60, 10, 10);
    const int x_off = 4, y_off = 8, z_off = 2, w_off = 4;
    const int x_size = 16, y_size = 16, z_size = 3, w_size = 3;

    buffer_t cropped = *full.raw_buffer();
    cropped.host = (uint8_t *)&(full(x_off, y_off, z_off, w_off));
    cropped.min[0] = 0;
    cropped.min[1] = 0;
    cropped.min[2] = 0;
    cropped.min[3] = 0;
    cropped.extent[0] = x_size;
    cropped.extent[1] = y_size;
    cropped.extent[2] = z_size;
    cropped.extent[3] = w_size;
    cropped.stride[0] *= 2;
    cropped.stride[1] *= 2;
    cropped.stride[2] *= 2;
    cropped.stride[3] *= 2;
    Buffer out(Int(32), &cropped);

    // Make a bitmask representing the region inside the crop.
    Image<bool> in_subregion(80, 60, 10, 10);
    Expr test = ((x >= x_off) && (x < x_off + x_size*2) &&
                 (y >= y_off) && (y < y_off + y_size*2) &&
                 (z >= z_off) && (z < z_off + z_size*2) &&
                 (w >= w_off) && (w < w_off + w_size*2) &&
                 (x % 2 == 0) && (y % 2 == 0) &&
                 (z % 2 == 0) && (w % 2 == 0));
    Func test_func;
    test_func(x, y, z, w) = test;
    test_func.realize(in_subregion);

    Func f;
    f(x, y, z, w) = 3*x + 2*y + z + 4*w;
    f.gpu_tile(x, y, 16, 16);
    f.output_buffer().set_stride(0, Expr());
    f.realize(out);

    // Put some data in the full host buffer, avoiding the region
    // being evaluated above.
    Expr change_out_of_subregion = select(test, undef<int>(), 4*x + 3*y + 2*z + w);
    lambda(x, y, z, w, change_out_of_subregion).realize(full);

    // Copy back the output subset from the GPU.
    out.copy_to_host();

    for (int w = 0; w < full.extent(3); ++w) {
        for (int z = 0; z < full.extent(2); ++z) {
            for (int y = 0; y < full.extent(1); ++y) {
                for (int x = 0; x < full.extent(0); ++x) {
                    int correct;
                    if (in_subregion(x, y, z, w)) {
                        int x_ = (x - x_off)/2;
                        int y_ = (y - y_off)/2;
                        int z_ = (z - z_off)/2;
                        int w_ = (w - w_off)/2;
                        correct = 3*x_ + 2*y_ + z_ + 4*w_;
                    } else {
                        correct = 4*x + 3*y + 2*z + w;
                    }
                    if (full(x, y, z, w) != correct) {
                        printf("Error! Incorrect value %i != %i at %i, %i, %i, %i\n",
                               full(x, y, z, w), correct, x, y, z, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
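// Sketch of the index mapping the crop above sets up (same names as the
// test): doubling each stride makes the cropped view sample every other
// element of 'full', starting at the offset corner, so
//
//     cropped(x, y, z, w) aliases full(x_off + 2*x, y_off + 2*y,
//                                      z_off + 2*z, w_off + 2*w),
//
// which is exactly what the verification loop above inverts with
// x_ = (x - x_off)/2, and so on for y, z, and w.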
int main(int argc, char **argv) {
    Buffer<uint8_t> input(128, 64);

    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = y*input.width() + x;
        }
    }

    Var x, y, xi, yi;

    {
        Func f;
        f(x, y) = select(((input(x, y) > 10) && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!(input(x, y) > 50))),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = ((input(x, y) > 10) && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!(input(x, y) > 50)));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses a let resulting from common
    // subexpression elimination.
    {
        Func f;
        Expr common_cond = input(x, y) > 10;
        f(x, y) = select((common_cond && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!common_cond)),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool common_cond = input(x, y) > 10;
                bool cond = (common_cond && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!common_cond));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition which has vector and scalar inputs.
    {
        Func f("f");
        f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = x < 10 || x > 20 || y < 10 || y > 20;
                uint8_t correct = cond ? 0 : input(x, y);
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses differently sized types.
    {
        Func f;
        Expr ten = 10;
        f(x, y) = select(input(x, y) > ten, u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = input(x, y) > 10;
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a select where the condition has a different width than
    // the true/false values.
    for (int w = 8; w <= 32; w *= 2) {
        for (int n = 8; n < w; n *= 2) {
            Type narrow = UInt(n), wide = UInt(w);

            Func in_wide;
            in_wide(x, y) = cast(wide, y + x*3);
            in_wide.compute_root();

            Func in_narrow;
            in_narrow(x, y) = cast(narrow, x*y + x - 17);
            in_narrow.compute_root();

            Func f;
            f(x, y) = select(in_narrow(x, y) > 10,
                             in_wide(x, y*2),
                             in_wide(x, y*2 + 1));

            Func cpu;
            cpu(x, y) = f(x, y);
            Func gpu;
            gpu(x, y) = f(x, y);
            Func out;
            out(x, y) = {cast<uint32_t>(cpu(x, y)), cast<uint32_t>(gpu(x, y))};
            cpu.compute_root();
            gpu.compute_root();

            Target target = get_jit_target_from_environment();
            if (target.has_feature(Target::OpenCL) && n == 16 && w == 32) {
                // Workaround for https://github.com/halide/Halide/issues/2477
                printf("Skipping uint%d -> uint%d for OpenCL\n", n, w);
                continue;
            }
            if (target.has_gpu_feature()) {
                gpu.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
            } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                gpu.hexagon().vectorize(x, 128);
            } else {
                // Just test vectorization
                gpu.vectorize(x, 8);
            }

            Realization r = out.realize(input.width(), input.height(), target);
            Buffer<uint32_t> cpu_output = r[0];
            Buffer<uint32_t> gpu_output = r[1];
            for (int y = 0; y < input.height(); y++) {
                for (int x = 0; x < input.width(); x++) {
                    if (cpu_output(x, y) != gpu_output(x, y)) {
                        fprintf(stderr, "gpu_output(%d, %d) = %d instead of %d for uint%d -> uint%d\n",
                                x, y, gpu_output(x, y), cpu_output(x, y), n, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    Target t(get_jit_target_from_environment());
    if (!t.has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    const int n_types = 9;
    Type types[] = {Int(8), Int(16), Int(32), Int(64),
                    UInt(8), UInt(16), UInt(32), UInt(64),
                    Float(32)};
    Func funcs[n_types];

    Var x;
    Func out;

    Type result_type;
    if (t.has_feature(Target::Metal)) {
        result_type = UInt(32);
    } else {
        result_type = UInt(64);
    }

    Expr e = cast(result_type, 0);
    int offset = 0;
    for (int i = 0; i < n_types; i++) {
        int off = 0;
        if ((types[i].is_int() || types[i].is_uint())) {
            // Metal does not support 64-bit integers.
            if (t.has_feature(Target::Metal) && types[i].bits() >= 64) {
                continue;
            }
            if (types[i].bits() <= 64) {
                off = (1 << (types[i].bits() - 4)) + 17;
            }
        }
        offset += off;
        funcs[i](x) = cast(types[i], x/16 + off);
        e += cast(result_type, funcs[i](x));
        funcs[i].compute_at(out, Var::gpu_blocks()).gpu_threads(x);
    }

    out(x) = e;
    out.gpu_tile(x, 23);

    Buffer output = out.realize(23*5);

    int result;
    if (t.has_feature(Target::Metal)) {
        result = check_result<uint32_t>(output, n_types - 2, offset);
    } else {
        result = check_result<uint64_t>(output, n_types, offset);
    }
    if (result != 0) {
        return result;
    }

    printf("Success!\n");
    return 0;
}
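// A plausible sketch of the checker assumed above (not necessarily the
// test's actual helper). Every Func that was included contributes x/16 plus
// its per-type offset, and those offsets were accumulated into 'offset', so
// the expected value at x is n_included*(x/16) + offset. (The narrow-typed
// Funcs don't wrap here because x stays small.)
template<typename T>
int check_result(Buffer output, int n_included, int offset) {
    Image<T> buf(output);
    for (int x = 0; x < buf.width(); x++) {
        T correct = (T)(n_included * (x / 16) + offset);
        if (buf(x) != correct) {
            printf("output(%d) = %lld instead of %lld\n",
                   x, (long long)buf(x), (long long)correct);
            return -1;
        }
    }
    return 0;  // the caller treats any nonzero value as failure
}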
int main(int argc, char *argv[]) {
#if !defined(STANDALONE) && !defined(TESTING_GPU)
    auto im = afwImage::MaskedImage<float>("../calexp-004207-g3-0123.fits");
    int width = im.getWidth(), height = im.getHeight();
#else
    int width = 2048, height = 1489;
    // int width = 200, height = 200;
    printf("[no load]");
#endif
    printf("Loaded: %d x %d\n", width, height);

    // Store image data in image(x, y), variance data in variance(x, y),
    // and mask data in mask(x, y).
    Image<float> image(width, height);
    Image<float> variance(width, height);
    Image<uint16_t> mask(width, height);

#if !defined(STANDALONE) && !defined(TESTING_GPU)
    // Read image in
    for (int y = 0; y < im.getHeight(); y++) {
        afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                              lsst::afw::image::VariancePixel>::x_iterator inPtr = im.x_at(0, y);
        for (int x = 0; x < im.getWidth(); x++) {
            image(x, y) = (*inPtr).image();
            variance(x, y) = (*inPtr).variance();
            mask(x, y) = (*inPtr).mask();
            inPtr++;
        }
    }
#endif

    int boundingBox = 5;
    Var x, y, i_v, y0, yi;

    // Compute output image and variance.
    // Polynomials that define the weights of a spatially variant linear
    // combination of 5 kernels. (Coefficients chosen for experimenting
    // with optimizations.)
    Func polynomial1, polynomial2, polynomial3, polynomial4, polynomial5;
    polynomial1(x, y) = 0.1f + 0.002f*x + 0.003f*y + 0.4f*x*x + 0.5f*x*y + 0.6f*y*y
                      + 0.0007f*x*x*x + 0.0008f*x*x*y + 0.0009f*x*y*y + 0.00011f*y*y*y;
    polynomial2(x, y) = 1.1f + 1.002f*x + 1.003f*y + 1.4f*x*x + 1.5f*x*y + 1.6f*y*y
                      + 1.0007f*x*x*x + 1.0008f*x*x*y + 1.0009f*x*y*y + 1.00011f*y*y*y;
    polynomial3(x, y) = 2.1f + 2.002f*x + 2.003f*y + 2.4f*x*x + 2.5f*x*y + 2.6f*y*y
                      + 2.0007f*x*x*x + 2.0008f*x*x*y + 2.0009f*x*y*y + 2.00011f*y*y*y;
    polynomial4(x, y) = 3.1f + 3.002f*x + 3.003f*y + 3.4f*x*x + 3.5f*x*y + 3.6f*y*y
                      + 3.0007f*x*x*x + 3.0008f*x*x*y + 3.0009f*x*y*y + 3.00011f*y*y*y;
    polynomial5(x, y) = 4.1f + 4.002f*x + 4.003f*y + 4.4f*x*x + 4.5f*x*y + 4.6f*y*y
                      + 4.0007f*x*x*x + 4.0008f*x*x*y + 4.0009f*x*y*y + 4.00011f*y*y*y;

    // Kernel #1
    Func kernel1;
    float sigmaX1 = 2.0f;
    float sigmaY1 = 2.0f;
    float theta1 = 0.0f;  // rotation of sigmaX axis
    kernel1(x, y) = (exp(-((x*cos(theta1) + y*sin(theta1))*(x*cos(theta1) + y*sin(theta1)))
                         /(2*sigmaX1*sigmaX1)) / (sqrtf(2*M_PI)*sigmaX1))
                   *(exp(-((y*cos(theta1) - x*sin(theta1))*(y*cos(theta1) - x*sin(theta1)))
                         /(2*sigmaY1*sigmaY1)) / (sqrtf(2*M_PI)*sigmaY1));

    // Kernel #2
    Func kernel2;
    float sigmaX2 = 0.5f;
    float sigmaY2 = 4.0f;
    float theta2 = 0.0f;  // rotation of sigmaX axis
    kernel2(x, y) = (exp(-((x*cos(theta2) + y*sin(theta2))*(x*cos(theta2) + y*sin(theta2)))
                         /(2*sigmaX2*sigmaX2)) / (sqrtf(2*M_PI)*sigmaX2))
                   *(exp(-((y*cos(theta2) - x*sin(theta2))*(y*cos(theta2) - x*sin(theta2)))
                         /(2*sigmaY2*sigmaY2)) / (sqrtf(2*M_PI)*sigmaY2));

    // Kernel #3
    Func kernel3;
    float sigmaX3 = 0.5f;
    float sigmaY3 = 4.0f;
    float theta3 = 3.14159f/4;  // rotation of sigmaX axis
    kernel3(x, y) = (exp(-((x*cos(theta3) + y*sin(theta3))*(x*cos(theta3) + y*sin(theta3)))
                         /(2*sigmaX3*sigmaX3)) / (sqrtf(2*M_PI)*sigmaX3))
                   *(exp(-((y*cos(theta3) - x*sin(theta3))*(y*cos(theta3) - x*sin(theta3)))
                         /(2*sigmaY3*sigmaY3)) / (sqrtf(2*M_PI)*sigmaY3));

    // Kernel #4
    Func kernel4;
    float sigmaX4 = 0.5f;
    float sigmaY4 = 4.0f;
    float theta4 = 3.14159f/2;  // rotation of sigmaX axis
    kernel4(x, y) = (exp(-((x*cos(theta4) + y*sin(theta4))*(x*cos(theta4) + y*sin(theta4)))
                         /(2*sigmaX4*sigmaX4)) / (sqrtf(2*M_PI)*sigmaX4))
                   *(exp(-((y*cos(theta4) - x*sin(theta4))*(y*cos(theta4) - x*sin(theta4)))
                         /(2*sigmaY4*sigmaY4)) / (sqrtf(2*M_PI)*sigmaY4));

    // Kernel #5
    Func kernel5;
    float sigmaX5 = 4.0f;
    float sigmaY5 = 4.0f;
    float theta5 = 0.0f;  // rotation of sigmaX axis
    kernel5(x, y) = (exp(-((x*cos(theta5) + y*sin(theta5))*(x*cos(theta5) + y*sin(theta5)))
                         /(2*sigmaX5*sigmaX5)) / (sqrtf(2*M_PI)*sigmaX5))
                   *(exp(-((y*cos(theta5) - x*sin(theta5))*(y*cos(theta5) - x*sin(theta5)))
                         /(2*sigmaY5*sigmaY5)) / (sqrtf(2*M_PI)*sigmaY5));

    // Compute output image plane
    Func image_bounded("image_bounded");
    image_bounded = BoundaryConditions::repeat_edge(image);

    // Spatially Invariant Implementation 1
    /*
    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help += image_bounded(x + i, y + j)
                             * (kernel1(i, j) + kernel2(i, j) + kernel3(i, j)
                              + kernel4(i, j) + kernel5(i, j));
            norm += (kernel1(i, j) + kernel2(i, j) + kernel3(i, j)
                   + kernel4(i, j) + kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage("blurImage");
    blurImage(x, y) = blur_image_help;
    */

    // Spatially Invariant Implementation 2
    /*
    Expr blur_image_help1 = 0.0f;
    Expr norm1 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help1 += image_bounded(x + i, y + j) * kernel1(i, j);
            norm1 += kernel1(i, j);
        }
    }
    // blur_image_help1 = blur_image_help1/norm1;
    Func blurImage1("blurImage1");
    blurImage1(x, y) = blur_image_help1;

    Expr blur_image_help2 = 0.0f;
    Expr norm2 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help2 += image_bounded(x + i, y + j) * kernel2(i, j);
            norm2 += kernel2(i, j);
        }
    }
    // blur_image_help2 = blur_image_help2/norm2;
    Func blurImage2("blurImage2");
    blurImage2(x, y) = blur_image_help2;

    Expr blur_image_help3 = 0.0f;
    Expr norm3 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help3 += image_bounded(x + i, y + j) * kernel3(i, j);
            norm3 += kernel3(i, j);
        }
    }
    // blur_image_help3 = blur_image_help3/norm3;
    Func blurImage3("blurImage3");
    blurImage3(x, y) = blur_image_help3;

    Expr blur_image_help4 = 0.0f;
    Expr norm4 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help4 += image_bounded(x + i, y + j) * kernel4(i, j);
            norm4 += kernel4(i, j);
        }
    }
    // blur_image_help4 = blur_image_help4/norm4;
    Func blurImage4("blurImage4");
    blurImage4(x, y) = blur_image_help4;

    Expr blur_image_help5 = 0.0f;
    Expr norm5 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help5 += image_bounded(x + i, y + j) * kernel5(i, j);
            norm5 += kernel5(i, j);
        }
    }
    // blur_image_help5 = blur_image_help5/norm5;
    Func blurImage5("blurImage5");
    blurImage5(x, y) = blur_image_help5;

    Func blurImage("blurImage");
    // blurImage(x, y) = (blurImage1(x, y) + blurImage2(x, y) + blurImage3(x, y) +
    //                    blurImage4(x, y) + blurImage5(x, y))/(5*norm1);
    blurImage(x, y) = (blur_image_help1 + blur_image_help2 + blur_image_help3
                     + blur_image_help4 + blur_image_help5)/(5*norm1);
    */

    // Spatially Variant Implementation 1
    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help += image_bounded(x + i, y + j)
                             * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                              + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                              + polynomial5(x, y)*kernel5(i, j));
            norm += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                   + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                   + polynomial5(x, y)*kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage("blurImage");
    blurImage(x, y) = blur_image_help;

    // Compute output variance plane
    Func variance_bounded("variance_bounded");
    variance_bounded = BoundaryConditions::repeat_edge(variance);

    Func blurVariance("blurVariance");
    Expr blur_variance_help = 0.0f;
    Expr vNorm2 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_variance_help += variance_bounded(x + i, y + j)
                                * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                 + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                 + polynomial5(x, y)*kernel5(i, j))
                                * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                 + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                 + polynomial5(x, y)*kernel5(i, j));
            vNorm2 += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                     + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                     + polynomial5(x, y)*kernel5(i, j))
                    * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                     + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                     + polynomial5(x, y)*kernel5(i, j));
        }
    }
    // blur_variance_help = blur_variance_help/(norm(x,y)*norm(x,y));
    blur_variance_help = blur_variance_help/(vNorm2*vNorm2);
    blurVariance(x, y) = blur_variance_help;

    // Compute output mask plane
    Func mask_bounded("mask_bounded");
    mask_bounded = BoundaryConditions::repeat_edge(mask);

    Func maskOut("maskOut");
    Expr maskOutHelp = 0;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                + polynomial5(x, y)*kernel5(i, j)) == 0.0f,
                                 maskOutHelp,
                                 maskOutHelp | mask_bounded(x + i, y + j));
            // maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j);
        }
    }
    maskOut(x, y) = maskOutHelp;

    // Schedule
    // blur.reorder(i_v, x, y);
    // kernel1.compute_at(blurImage, x);
    // kernel1.vectorize(x, 8);
    // kernel1.split(y, y0, yi, 4);
    // kernel1.parallel(y0);
    /*
    kernel1.compute_root();
    kernel2.compute_root();
    kernel3.compute_root();
    kernel4.compute_root();
    kernel5.compute_root();
    */

    // Best schedule found:
#ifdef TESTING_GPU
    blurImage.gpu_tile(x, y, 16, 16);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.set_feature(Target::OpenCL);

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the Debug flag. This is
    // helpful for figuring out which stages are slow, or when
    // CPU -> GPU copies happen. It hurts performance though, so
    // we'll leave it commented out.
    // target.set_feature(Target::Debug);

    blurImage.compile_jit(target);
#else
    blurImage.split(y, y0, yi, 4);
    blurImage.parallel(y0);
    blurImage.vectorize(x, 8);
#endif

    // Split the y coordinate of the consumer into strips:
    blurVariance.split(y, y0, yi, 4);
    // Compute the strips using a thread pool and a task queue.
    blurVariance.parallel(y0);
    // Vectorize across x.
    blurVariance.vectorize(x, 8);

    // polynomial1.compute_at(blurImage, x).vectorize(x, 8);
    // kernel1.compute_at(blurImage, x).vectorize(x, 8);

    // Split the y coordinate of the consumer into strips of 30 scanlines:
    maskOut.split(y, y0, yi, 30);
    // Compute the strips using a thread pool and a task queue.
    maskOut.parallel(y0);
    // Vectorize across x by a factor of eight.
    maskOut.vectorize(x, 8);

    // kernel1.trace_stores();
    // blurImage.trace_stores();

    // Check out what is happening
    blurImage.print_loop_nest();
    // Print out pseudocode for the pipeline.
    blurImage.compile_to_lowered_stmt("linearCombinationKernelBlurImage.html", {image}, HTML);
    // blurImage.compile_to_c("linearCombinationKernel_C_Code.cpp",
    //                        std::vector<Argument>(), "linearCombinationKernel_C_Code");
    // blurVariance.compile_to_lowered_stmt("blur.html", {variance}, HTML);

    // Benchmark the pipeline.
#ifdef TESTING_GPU
    Buffer image_output(Float(32), image.width(), image.height());  // for GPU testing
#else
    Image<float> image_output(image.width(), image.height());
#endif
    blurImage.realize(image_output);

    Image<float> variance_output(variance.width(), variance.height());
    blurVariance.realize(variance_output);

    Image<int32_t> mask_output(mask.width(), mask.height());
    maskOut.realize(mask_output);

#ifdef TESTING_GPU
    // Run the filter once to initialize any GPU runtime state.
    blurImage.realize(image_output);

    // Now take the best of 3 runs for timing.
    double best_time;
    for (int i = 0; i < 3; i++) {
        double t1 = current_time();

        // Run the filter 100 times.
        for (int j = 0; j < 100; j++) {
            blurImage.realize(image_output);
        }

        // Force any GPU code to finish by copying the buffer back to the CPU.
        image_output.copy_to_host();

        double t2 = current_time();
        double elapsed = (t2 - t1)/100;
        if (i == 0 || elapsed < best_time) {
            best_time = elapsed;
        }
    }
    printf("%1.4f milliseconds\n", best_time);
#else
    double average = 0;
    double min;
    double max;
    double imgTime;
    double varTime;
    double maskTime;
    int numberOfRuns = 5;
    for (int i = 0; i < numberOfRuns; i++) {
        double t1 = current_time();
        blurImage.realize(image_output);
        double t2 = current_time();
        blurVariance.realize(variance_output);
        double t3 = current_time();
        maskOut.realize(mask_output);
        double t4 = current_time();

        double curTime = (t4 - t1);
        average += curTime;
        if (i == 0) {
            min = curTime;
            max = curTime;
            imgTime = t2 - t1;
            varTime = t3 - t2;
            maskTime = t4 - t3;
        } else {
            if (curTime < min) {
                min = curTime;
                imgTime = t2 - t1;
                varTime = t3 - t2;
                maskTime = t4 - t3;
            }
            if (curTime > max) {
                max = curTime;
            }
        }
    }
    average = average/numberOfRuns;
    std::cout << "Average Time: " << average << ", Min = " << min << ", Max = " << max
              << ", with " << numberOfRuns << " runs" << '\n';
    cout << "For fastest run total time = " << min << ", imgTime = " << imgTime
         << ", varTime = " << varTime << ", maskTime = " << maskTime << endl;
#endif

#if !defined(STANDALONE) && !defined(TESTING_GPU)
    // Write image out
    auto imOut = afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                                       lsst::afw::image::VariancePixel>(im.getWidth(), im.getHeight());
    for (int y = 0; y < imOut.getHeight(); y++) {
        afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                              lsst::afw::image::VariancePixel>::x_iterator inPtr = imOut.x_at(0, y);
        for (int x = 0; x < imOut.getWidth(); x++) {
            afwImage::pixel::SinglePixel<float, lsst::afw::image::MaskPixel,
                                         lsst::afw::image::VariancePixel>
                curPixel(image_output(x, y), mask_output(x, y), variance_output(x, y));
            (*inPtr) = curPixel;
            inPtr++;
        }
    }
    imOut.writeFits("./halideLinearCombination5x5.fits");
#endif
    return 0;
}
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVidia GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls done by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}
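// A minimal timing sketch for the schedule above (the buffer name and the
// current_time() helper are assumptions borrowed from the benchmark
// elsewhere in this section). Because GPU kernels run asynchronously, the
// loop forces completion with copy_to_host() before stopping the clock.
Buffer<uint8_t> output(input.width(), input.height(), input.channels());
curved.realize(output);  // warm-up run: pays one-time GPU setup costs
double t1 = current_time();
for (int i = 0; i < 100; i++) {
    curved.realize(output);
}
output.copy_to_host();  // wait for the queued GPU work to finish
double t2 = current_time();
printf("%1.4f milliseconds per run\n", (t2 - t1)/100);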
int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();
    if (!t.features_any_of({Target::CUDACapability50, Target::CUDACapability61})) {
        printf("This test requires cuda enabled with cuda capability 5.0 or greater\n");
        return 0;
    }

    {
        // Shuffle test to do a small convolution
        Func f, g;
        Var x, y;

        f(x, y) = x + y;
        g(x, y) = f(x-1, y) + f(x+1, y);

        Var xo, xi, yi, yo;
        g.gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp).gpu_lanes(xi);
        f.compute_root();
        f.in(g).compute_at(g, yi)
            .split(x, xo, xi, 32, TailStrategy::RoundUp)
            .gpu_lanes(xi)
            .unroll(xo);

        Buffer<int> out = g.realize(32, 4);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 2*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Broadcast test - an outer product access pattern
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .gpu_lanes(xi);

        // We're going to be computing 'a' and 'b' at block level, but
        // we want them in register, not shared, so we explicitly call
        // store_in.
        a.in(c).compute_at(c, x)
            .gpu_lanes(x)
            .store_in(MemoryType::Register);
        b.in(c).compute_at(c, x)
            .gpu_lanes(y)
            .store_in(MemoryType::Register);

        Buffer<float> out = c.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Vectorized broadcast test. Each lane is responsible for a
        // 2-vector from 'a' and a 2-vector from 'b' instead of a single
        // value.
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 64, 64, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .split(yi, yi, yii, 64).unroll(yii, 2).gpu_threads(yi)
            .vectorize(xi, 2).gpu_lanes(xi);

        a.in(c).compute_at(c, yi).vectorize(x, 2).gpu_lanes(x);
        b.in(c).compute_at(c, yi).vectorize(y, 2).gpu_lanes(y);

        Buffer<float> out = c.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A stencil chain where many of the lanes will be masked
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x-1, y) + a(x, y) + a(x+1, y);
        c(x, y) = b(x-1, y) + b(x, y) + b(x+1, y);
        d(x, y) = c(x-1, y) + c(x, y) + c(x+1, y);

        Var xi, yi;
        // Compute 24-wide pieces of output per block. Should use 32
        // warp lanes to do so. The footprint on the input is 30, so
        // the last two lanes are always inactive. 26-wide blocks
        // would be a more efficient use of the gpu, but a less
        // interesting test.
        d.gpu_tile(x, y, xi, yi, 24, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Same as above, but in half-warps
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x-1, y) + a(x, y) + a(x+1, y);
        c(x, y) = b(x-1, y) + b(x, y) + b(x+1, y);
        d(x, y) = c(x-1, y) + c(x, y) + c(x+1, y);

        Var xi, yi;
        // Compute 10-wide pieces of output per block. Should use 16
        // warp lanes to do so.
        d.gpu_tile(x, y, xi, yi, 10, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A shuffle with a shift amount that depends on the y coord
        Func a, b;
        Var x, y;
        a(x, y) = x + y;
        b(x, y) = a(x + y, y);

        Var xi, yi;
        b.gpu_tile(x, y, xi, yi, 16, 8, TailStrategy::RoundUp).gpu_lanes(xi);
        a.compute_at(b, yi).gpu_lanes(x);

        Buffer<int> out = b.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = x + 2*y;
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Bilinear upsample
        Func f, upx, upy;
        Var x, y;

        f(x, y) = cast<float>(x + y);
        f.compute_root();

        upx(x, y) = 0.25f * f((x/2) - 1 + 2*(x % 2), y) + 0.75f * f(x/2, y);
        upy(x, y) = 0.25f * upx(x, (y/2) - 1 + 2*(y % 2)) + 0.75f * upx(x, y/2);

        // Compute 128x64 tiles of output, which require 66x34 tiles
        // of input. All intermediate data stored in lanes and
        // accessed using register shuffles.
        Var xi, yi, xii, yii;
        upy.tile(x, y, xi, yi, 128, 64, TailStrategy::RoundUp)
            .tile(xi, yi, xii, yii, 4, 8).vectorize(xii)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);

        upx.compute_at(upy, yi).unroll(x, 4).gpu_lanes(x).unroll(y);

        // Stage the input into lanes, doing two dense vector loads
        // per lane, and use register shuffles to do the upsample in x.
        f.in().compute_at(upy, yi).align_storage(x, 64)
            .vectorize(x, 2, TailStrategy::RoundUp)
            .split(x, x, xi, 32, TailStrategy::GuardWithIf)
            .reorder(xi, y, x).gpu_lanes(xi).unroll(x).unroll(y);

        upy.output_buffer().dim(0).set_min(0).dim(1).set_min(0);

        Buffer<float> out = upy.realize(128, 128);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x + y - 1) / 2.0f;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Box-downsample by a factor of 8 using summation within each
        // warp.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2*x, y) + f(2*x + 1, y);
        s2(x, y) = s1(2*x, y) + s1(2*x + 1, y);
        s3(x, y) = s2(2*x, y) + s2(2*x + 1, y);
        s4(x, y) = s3(x, y);

        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 64, 1, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 64, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                // One factor of 8 from adding instead of averaging,
                // and another factor of 8 from the compression of the
                // coordinate system across x.
                float correct = (x*8 + y)*8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // The same, with a narrower tile in x so that one warp is
        // divided up across many scanlines.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2*x, y) + f(2*x + 1, y);
        s2(x, y) = s1(2*x, y) + s1(2*x + 1, y);
        s3(x, y) = s2(2*x, y) + s2(2*x + 1, y);
        s4(x, y) = s3(x, y);

        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 8, 16, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 8, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x*8 + y)*8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        Buffer<uint8_t> buf(256, 256);
        buf.for_each_value([](uint8_t &x) { x = rand(); });
        buf.set_host_dirty();

        // Store a small LUT in-register, populated at the warp
        // level.
        Func lut;
        Var x, y;
        lut(x) = cast<uint16_t>(x) + 1;

        Func curved;
        curved(x, y) = lut(buf(x, y));

        Var xi, yi, xo;
        curved.compute_root().tile(x, y, xi, yi, 32, 32)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);

        lut.compute_root();
        // Load the LUT into shared at the start of each block using warp 0.
        lut.in().compute_at(curved, x).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);
        // Load it from shared into registers for each warp.
        lut.in().in().compute_at(curved, yi).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);

        Buffer<uint16_t> out = curved.realize(buf.width(), buf.height());
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                uint16_t actual = out(x, y);
                uint16_t correct = ((uint16_t)buf(x, y)) + 1;
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Test a case that caused combinatorial explosion
        Var x;
        Expr e = x;
        for (int i = 0; i < 10; i++) {
            e = fast_pow(e, e + 1);
        }

        Func f;
        f(x) = e;

        Var xo, xi;
        f.gpu_tile(x, xo, xi, 32);

        f.realize(1024);
    }

    printf("Success!\n");
    return 0;
}
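// Distilled sketch of the pattern the tests above exercise (an assumed
// simplification, not one of the test cases): gpu_lanes() marks a dimension
// of extent at most 32 as the warp-lane dimension, so a producer staged at
// that level lives in registers and is read via warp shuffles rather than
// shared memory.
Func producer, consumer;
Var x, y, xi, yi;
producer(x, y) = x + y;
consumer(x, y) = producer(x - 1, y) + producer(x + 1, y);
// 24-wide tiles keep the producer footprint (26 values) within one 32-lane warp.
consumer.gpu_tile(x, y, xi, yi, 24, 2).gpu_lanes(xi);
producer.compute_at(consumer, yi).gpu_lanes(x);
Buffer<int> out = consumer.realize(24, 2);  // expect out(x, y) == 2*(x + y)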
template<typename A, typename B>
bool test(int vec_width, const Target &target) {
    if (!is_type_supported<A>(vec_width, target) ||
        !is_type_supported<B>(vec_width, target)) {
        // Type not supported, return pass.
        return true;
    }

    int W = 1024;
    int H = 1;

    Buffer<A> input(W, H);
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            input(x, y) = (A)((rand() & 0xffff)*0.1);
        }
    }

    Var x, y;
    Func f;
    f(x, y) = cast<B>(input(x, y));

    if (target.has_gpu_feature()) {
        Var xo, xi;
        f.gpu_tile(x, xo, xi, 64);
    } else {
        if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            // TODO: Non-native vector widths hang the compiler here.
            //f.hexagon();
        }
        if (vec_width > 1) {
            f.vectorize(x, vec_width);
        }
    }

    Buffer<B> output = f.realize(W, H);

    /*
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            printf("%d %d -> %d %d\n", x, y, (int)(input(x, y)), (int)(output(x, y)));
        }
    }
    */

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            bool ok = ((B)(input(x, y)) == output(x, y));
            if (!ok) {
                fprintf(stderr, "%s x %d -> %s x %d failed\n",
                        string_of_type<A>(), vec_width,
                        string_of_type<B>(), vec_width);
                fprintf(stderr, "At %d %d, %f -> %f instead of %f\n",
                        x, y,
                        (double)(input(x, y)),
                        (double)(output(x, y)),
                        (double)((B)(input(x, y))));
                return false;
            }
        }
    }

    return true;
}
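// A plausible sketch of the helper assumed by the harness above (not its
// actual definition): a cast test only makes sense if the target can
// represent the type at that vector width, e.g. 64-bit integers and float64
// are unavailable on several GPU backends. Target::supports_type and
// type_of<T>() are existing Halide APIs.
template<typename T>
bool is_type_supported(int vec_width, const Target &target) {
    return target.supports_type(type_of<T>().with_lanes(vec_width));
}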
int main(int argc, char **argv) {
    const int W = 256, H = 256;
    Buffer<uint8_t> in(W, H);
    // Set up the input.
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            in(x, y) = rand() & 0xff;
        }
    }

    // Define a convolution kernel, and its sum.
    Buffer<int8_t> kernel(3, 3);
    kernel.set_min(-1, -1);
    for (int y = -1; y <= 1; y++) {
        for (int x = -1; x <= 1; x++) {
            kernel(x, y) = rand() % 8 - 4;
        }
    }

    Var x("x"), y("y"), xi("xi"), yi("yi");
    RDom r(-1, 3, -1, 3);

    // Boundary condition.
    Func input = BoundaryConditions::repeat_edge(in);
    input.compute_root();

    // Test a widening reduction, followed by a narrowing.
    {
        Func f;
        f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = f.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y)
                              << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    // Test a tuple reduction with widening, followed by narrowing the result.
    {
        Func f;
        f(x, y) = {i16(0), i8(0)};
        f(x, y) = {
            f(x, y)[0] + i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y),
            f(x, y)[1] + kernel(r.x, r.y),
        };

        Func g;
        g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            g.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            g.hexagon().vectorize(x, 128);
        } else {
            g.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = g.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                        correct += kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y)
                              << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    std::cout << "Success!\n";
    return 0;
}