// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);

    // Then we tell Halide that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of work-groups and work-items.
    lut.gpu_blocks(block)
        .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
        .bound(c, 0, 3)
        .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above, xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, and
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVIDIA GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls made by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}
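// For reference, a minimal sketch of how the scheduled pipeline might
// be exercised. It assumes 'input' is the Buffer<uint8_t> the pipeline
// reads from; the function name and the output buffer are illustrative,
// not part of the schedule above.
void run_on_gpu() {
    schedule_for_gpu();

    // Allocate an output buffer with the same extent as the input.
    Buffer<uint8_t> output(input.width(), input.height(), input.channels());

    // realize() runs the JIT-compiled pipeline. With the schedule
    // above, the computation happens on the GPU and the result is
    // left in GPU memory.
    curved.realize(output);

    // Copying the buffer back to the CPU forces any pending GPU work
    // to finish and makes the pixels readable from host code.
    output.copy_to_host();
}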