// Test that Halide inserts GPU thread barriers (syncthreads) correctly
// when a Func with many update stages is computed inside GPU blocks and
// its stages are mapped across GPU threads.
int main(int argc, char **argv) {
    // Skip (successfully) when no GPU target is enabled in the environment.
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu. Each pass flips the
        // rows and then the columns; an even number of flips leaves the
        // data unchanged, which gives us a known expected output below.
        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        // Compute f once per GPU block; its update stages then race
        // across the block's threads unless barriers are inserted.
        f.compute_at(g, Var::gpu_blocks());
        // Each pass above added four update definitions: two row flips
        // (parallel over y) then two column flips (parallel over x).
        for (int i = 0; i < passes; i++) {
            f.update(i * 4 + 0).gpu_threads(y);
            f.update(i * 4 + 1).gpu_threads(y);
            f.update(i * 4 + 2).gpu_threads(x);
            f.update(i * 4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                // Flips cancel out, so f(0,0) = 0 and f(9,7) = 9 + 7*100.
                int correct = 7 * 100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.
        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);
        g.gpu_tile(x, y, 8, 8);

        f.compute_at(g, Var::gpu_blocks());
        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));
        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, 16); // Func::gpu_tile method is similar to Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing the // intermediate result in shared memory. Var::gpu_blocks, and // Var::gpu_threads exist to help you schedule producers within // GPU threads and blocks. 
padded.compute_at(curved, Var::gpu_blocks()); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA or OpenCL are // not enabled by default. We have to construct a Target // object, enable one of them, and then pass that target // object to compile_jit. Otherwise your CPU will very slowly // pretend it's a GPU, and use one thread per output pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or CUDA. // We'll enable OpenCL here, because it tends to give better // performance than CUDA, even with NVidia's drivers, because // NVidia's open source LLVM backend doesn't seem to do all // the same optimizations their proprietary compiler does. target.features |= Target::OpenCL; // Uncomment the next line and comment out the line above to // try CUDA instead. // target.features |= Target::CUDA; // If you want to see all of the OpenCL or CUDA API calls done // by the pipeline, you can also enable the GPUDebug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. //target.features |= Target::GPUDebug; curved.compile_jit(target); }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, block, thread, 16); // Func::gpu_tile behaves the same as Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing // the intermediate result in shared memory. In the schedule // above xo corresponds to GPU blocks. 
padded.compute_at(curved, xo); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or // Metal are not enabled by default. We have to construct a // Target object, enable one of them, and then pass that // target object to compile_jit. Otherwise your CPU will very // slowly pretend it's a GPU, and use one thread per output // pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or Metal, depending on which platform // we're on. OS X doesn't update its OpenCL drivers, so they // tend to be broken. CUDA would also be a fine choice on // machines with NVidia GPUs. if (target.os == Target::OSX) { target.set_feature(Target::Metal); } else { target.set_feature(Target::OpenCL); } // Uncomment the next line and comment out the lines above to // try CUDA instead. // target.set_feature(Target::CUDA); // If you want to see all of the OpenCL, Metal, or CUDA API // calls done by the pipeline, you can also enable the Debug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. // target.set_feature(Target::Debug); curved.compile_jit(target); }