int main(int argc, char **argv) { Var x, y; Func f; f(x, y) = x+y; // Dig out the raw function pointer so we can use it as if we were // compiling statically void (*function)(buffer_t *) = (void (*)(buffer_t *))(f.compile_jit()); buffer_t out; memset(&out, 0, sizeof(out)); out.host = (uint8_t *)malloc(10*10); out.elem_size = 1; // should be 4! out.extent[0] = 10; out.extent[1] = 10; out.stride[0] = 1; out.stride[1] = 10; f.set_error_handler(&halide_error); error_occurred = false; function(&out); if (error_occurred) { printf("Success!\n"); return 0; } else { printf("There should have been a runtime error\n"); return -1; } }
// Now we define methods that give our pipeline several different // schedules. void schedule_for_cpu() { // Compute the look-up-table ahead of time. lut.compute_root(); // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Look-up-tables don't vectorize well, so just parallelize // curved in slices of 16 scanlines. Var yo, yi; curved.split(y, yo, yi, 16) .parallel(yo); // Compute sharpen as needed per scanline of curved. sharpen.compute_at(curved, yi); // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide. sharpen.vectorize(x, 8); // Compute the padded input as needed per scanline of curved, // reusing previous values computed within the same strip of // 16 scanlines. padded.store_at(curved, yo) .compute_at(curved, yi); // Also vectorize the padding. It's 8-bit, so we'll vectorize // 16-wide. padded.vectorize(x, 16); // JIT-compile the pipeline for the CPU. curved.compile_jit(); }
// Compiles and runs f into the shared `output` image, optionally checks
// every pixel against a reference computed from `input`, and returns the
// time taken by ten further realizations (in the units of currentTime()).
double test(Func f, bool test_correctness = true) {
    // Emit assembly next to the JIT-compiled version.
    f.compile_to_assembly(f.name() + ".s", Internal::vec<Argument>(input), f.name());
    f.compile_jit();
    f.realize(output);

    if (test_correctness) {
        for (int y = 0; y < output.height(); y++) {
            for (int x = 0; x < output.width(); x++) {
                // Reference value: 3 * in(x) + in(x+1), with both
                // coordinates clamped to [MIN, MAX].
                int here = std::max(std::min(x, MAX), MIN);
                int next = std::max(std::min(x+1, MAX), MIN);
                uint16_t expected = input(here, y) * 3 + input(next, y);
                if (output(x, y) == expected) {
                    continue;
                }
                printf("output(%d, %d) = %d instead of %d\n",
                       x, y, output(x, y), expected);
                exit(-1);
            }
        }
    }

    // Time ten back-to-back realizations.
    const double start = currentTime();
    for (int run = 0; run != 10; ++run) {
        f.realize(output);
    }
    return currentTime() - start;
}
int main(int argc, char **argv) { Buffer<int> input(100, 50); // This image represents the range [100, 199]*[50, 99] input.set_min(100, 50); input(100, 50) = 123; input(198, 99) = 234; Func f; Var x, y; f(x, y) = input(2*x, y/2); f.compile_jit(); // The output will represent the range from [50, 99]*[100, 199] Buffer<int> result(50, 100); result.set_min(50, 100); f.realize(result); if (result(50, 100) != 123 || result(99, 199) != 234) { fprintf(stderr, "Err: f(50, 100) = %d (supposed to be 123)\n" "f(99, 199) = %d (supposed to be 234)\n", result(50, 100), result(99, 199)); return -1; } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { ImageParam input(UInt(8), 1); input.dim(0).set_bounds(0, size); { Func f; Var x; f(x) = input(x); // Output must have the same size as the input. f.output_buffer().dim(0).set_bounds(input.dim(0).min(), input.dim(0).extent()); f.add_custom_lowering_pass(new Validator); f.compile_jit(); Buffer<uint8_t> dummy(size); dummy.fill(42); input.set(dummy); Buffer<uint8_t> out = f.realize(size); if (!out.all_equal(42)) { std::cerr << "wrong output" << std::endl; exit(-1); } } { Func f; Var x; f(x) = undef(UInt(8)); RDom r(input); f(r.x) = cast<uint8_t>(42); f.add_custom_lowering_pass(new Validator); f.compile_jit(); Buffer<uint8_t> dummy(size); input.set(dummy); Buffer<uint8_t> out = f.realize(size); if (!out.all_equal(42)) { std::cerr << "wrong output" << std::endl; exit(-1); } } std::cout << "Success!" << std::endl; return 0; }
int main(int argc, char **argv) { ImageParam src(UInt(8), 1); Func dst; Var x; dst(x) = src(x); Var xo; dst.split(x, xo, x, 8*4096); // dst.parallel(xo); speeds up halide's memcpy considerably, but doesn't seem sporting dst.vectorize(x, 16); dst.compile_to_assembly("memcpy.s", {src}, "memcpy"); dst.compile_jit(); const int32_t buffer_size = 12345678; const int iterations = 50; Image<uint8_t> input(buffer_size); Image<uint8_t> output(buffer_size); src.set(input); // Get past one-time set-up issues for the ptx backend. dst.realize(output); double halide = 0, system = 0; for (int i = 0; i < iterations; i++) { double t1 = current_time(); dst.realize(output); dst.realize(output); dst.realize(output); double t2 = current_time(); memcpy(output.data(), input.data(), input.width()); memcpy(output.data(), input.data(), input.width()); memcpy(output.data(), input.data(), input.width()); double t3 = current_time(); system += t3-t2; halide += t2-t1; } printf("system memcpy: %.3e byte/s\n", (buffer_size / system) * 3 * 1000 * iterations); printf("halide memcpy: %.3e byte/s\n", (buffer_size / halide) * 3 * 1000 * iterations); // memcpy will win by a little bit for large inputs because it uses streaming stores if (halide > system * 2) { printf("Halide memcpy is slower than it should be.\n"); return -1; } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { // Define a pipeline that dumps some squares to a file using an // external consumer stage. Func source; Var x; source(x) = x*x; Param<int> min, extent; Param<const char *> filename; Func sink; std::vector<ExternFuncArgument> args; args.push_back(source); args.push_back(filename); args.push_back(min); args.push_back(extent); sink.define_extern("dump_to_file", args, Int(32), 0); source.compute_root(); sink.compile_jit(); // Dump the first 10 squares to a file filename.set("halide_test_extern_consumer.txt"); min.set(0); extent.set(10); sink.realize(); if (!check_result()) return -1; // Test ImageParam ExternFuncArgument via passed in image. Image<int32_t> buf = source.realize(10); ImageParam passed_in(Int(32), 1); passed_in.set(buf); Func sink2; std::vector<ExternFuncArgument> args2; args2.push_back(passed_in); args2.push_back(filename); args2.push_back(min); args2.push_back(extent); sink2.define_extern("dump_to_file", args2, Int(32), 0); sink2.realize(); if (!check_result()) return -1; printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Func f; Var x, y; ImageParam in(Float(32), 2); ImageParam x_coord(Int(32), 2); ImageParam y_coord(Int(32), 2); f(x, y) = 0.0f; RDom r(0, 100, 0, 100); f(x_coord(r.x, r.y), y_coord(r.x, r.y)) += in(r.x, r.y); f.compile_jit(); printf("I should not have reached here\n"); return 0; }
int main(int argc, char **argv) { Func f; Var x, y; f(x, y) = x + y; f.parallel(x); // Having more threads than tasks shouldn't hurt performance too much. double correct_time = 0; for (int t = 2; t <= 64; t *= 2) { std::ostringstream ss; ss << "HL_NUM_THREADS=" << t; std::string str = ss.str(); char buf[32] = {0}; memcpy(buf, str.c_str(), str.size()); putenv(buf); Halide::Internal::JITSharedRuntime::release_all(); f.compile_jit(); // Start the thread pool without giving any hints as to the // number of tasks we'll be using. f.realize(t, 1); double min_time = 1e20; for (int i = 0; i < 3; i++) { double t1 = current_time(); f.realize(2, 1000000); double t2 = current_time() - t1; if (t2 < min_time) min_time = t2; } printf("%d: %f ms\n", t, min_time); if (t == 2) { correct_time = min_time; } else if (min_time > correct_time * 5) { printf("Unacceptable overhead when using %d threads for 2 tasks: %f ms vs %f ms\n", t, min_time, correct_time); return -1; } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { ImageParam input(Float(32), 2); Var x, y, z; RDom dom(0, input.width()*8); Func f; Expr hard_to_reason_about = cast<int>(hypot(input.width(), input.height())); f(x, y, z) = 1; f(x, y, dom / hard_to_reason_about) += 1; f.compile_jit(); Image<float> im(32, 32); input.set(im); f.realize(100, 100, 16); printf("Success!\n"); return 0; }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, 16); // Func::gpu_tile method is similar to Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing the // intermediate result in shared memory. Var::gpu_blocks, and // Var::gpu_threads exist to help you schedule producers within // GPU threads and blocks. 
padded.compute_at(curved, Var::gpu_blocks()); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA or OpenCL are // not enabled by default. We have to construct a Target // object, enable one of them, and then pass that target // object to compile_jit. Otherwise your CPU will very slowly // pretend it's a GPU, and use one thread per output pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or CUDA. // We'll enable OpenCL here, because it tends to give better // performance than CUDA, even with NVidia's drivers, because // NVidia's open source LLVM backend doesn't seem to do all // the same optimizations their proprietary compiler does. target.features |= Target::OpenCL; // Uncomment the next line and comment out the line above to // try CUDA instead. // target.features |= Target::CUDA; // If you want to see all of the OpenCL or CUDA API calls done // by the pipeline, you can also enable the GPUDebug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. //target.features |= Target::GPUDebug; curved.compile_jit(target); }
int main(int argc, char **argv) { const int N = 1 << 10; Image<int> data(N); for (int i = 0; i < N; i++) { data(i) = rand() & 0xfffff; } Func input = lambda(x, data(x)); printf("Bitonic sort...\n"); Func f = bitonic_sort(input, N); f.bound(x, 0, N); f.compile_jit(); printf("Running...\n"); Image<int> bitonic_sorted(N); f.realize(bitonic_sorted); double t1 = current_time(); for (int i = 0; i < 10; i++) { f.realize(bitonic_sorted); } double t2 = current_time(); printf("Merge sort...\n"); f = merge_sort(input, N); f.bound(x, 0, N); f.compile_jit(); printf("Running...\n"); Image<int> merge_sorted(N); f.realize(merge_sorted); double t3 = current_time(); for (int i = 0; i < 10; i++) { f.realize(merge_sorted); } double t4 = current_time(); Image<int> correct(N); for (int i = 0; i < N; i++) { correct(i) = data(i); } printf("std::sort...\n"); double t5 = current_time(); std::sort(&correct(0), &correct(N)); double t6 = current_time(); printf("Times:\n" "bitonic sort: %f \n" "merge sort: %f \n" "std::sort %f\n", (t2-t1)/10, (t4-t3)/10, t6-t5); if (N <= 100) { for (int i = 0; i < N; i++) { printf("%8d %8d %8d\n", correct(i), bitonic_sorted(i), merge_sorted(i)); } } for (int i = 0; i < N; i++) { if (bitonic_sorted(i) != correct(i)) { printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i)); return -1; } if (merge_sorted(i) != correct(i)) { printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i)); return -1; } } return 0; }
int main(int argc, char **argv) { const int N = 1 << 10; Buffer<int> data(N); for (int i = 0; i < N; i++) { data(i) = rand() & 0xfffff; } Func input = lambda(x, data(x)); printf("Bitonic sort...\n"); Func f = bitonic_sort(input, N); f.bound(x, 0, N); f.compile_jit(); printf("Running...\n"); Buffer<int> bitonic_sorted(N); f.realize(bitonic_sorted); double t_bitonic = benchmark(1, 10, [&]() { f.realize(bitonic_sorted); }); printf("Merge sort...\n"); f = merge_sort(input, N); f.bound(x, 0, N); f.compile_jit(); printf("Running...\n"); Buffer<int> merge_sorted(N); f.realize(merge_sorted); double t_merge = benchmark(1, 10, [&]() { f.realize(merge_sorted); }); Buffer<int> correct(N); for (int i = 0; i < N; i++) { correct(i) = data(i); } printf("std::sort...\n"); double t_std = benchmark(1, 1, [&]() { std::sort(&correct(0), &correct(N)); }); printf("Times:\n" "bitonic sort: %fms \n" "merge sort: %fms \n" "std::sort %fms\n", t_bitonic * 1e3, t_merge * 1e3, t_std * 1e3); if (N <= 100) { for (int i = 0; i < N; i++) { printf("%8d %8d %8d\n", correct(i), bitonic_sorted(i), merge_sorted(i)); } } for (int i = 0; i < N; i++) { if (bitonic_sorted(i) != correct(i)) { printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i)); return -1; } if (merge_sorted(i) != correct(i)) { printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i)); return -1; } } return 0; }
int main(int argc, char *argv[]) { #if !defined(STANDALONE) && !defined(TESTING_GPU) auto im = afwImage::MaskedImage<float>("../calexp-004207-g3-0123.fits"); int width = im.getWidth(), height = im.getHeight(); #else int width = 2048, height = 1489; // int width = 200, height = 200; printf("[no load]"); #endif printf("Loaded: %d x %d\n", width, height); //store image data in img_var(x, y, 0) and variance data in img_var(x, y, 1) Image<float> image(width, height); Image<float> variance(width, height); Image<uint16_t> mask(width, height); #if !defined(STANDALONE) && !defined(TESTING_GPU) //Read image in for (int y = 0; y < im.getHeight(); y++) { afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>::x_iterator inPtr = im.x_at(0, y); for (int x = 0; x < im.getWidth(); x++){ image(x, y) = (*inPtr).image(); variance(x, y) = (*inPtr).variance(); mask(x, y) = (*inPtr).mask(); inPtr++; } } #endif int boundingBox = 5; Var x, y, i_v, y0, yi; //compute output image and variance //Polynomials that define weights of spatially variant linear combination of 5 kernels Func polynomial1, polynomial2, polynomial3, polynomial4, polynomial5; polynomial1(x, y) = 0.1f + 0.002f*x + 0.003f*y + 0.4f*x*x + 0.5f*x*y + 0.6f*y*y + 0.0007f*x*x*x + 0.0008f*x*x*y + 0.0009f*x*y*y + 0.00011f*y*y*y; //for experimenting with optimizations polynomial2(x, y) = 1.1f + 1.002f*x + 1.003f*y + 1.4f*x*x + 1.5f*x*y + 1.6f*y*y + 1.0007f*x*x*x + 1.0008f*x*x*y + 1.0009f*x*y*y + 1.00011f*y*y*y; //for experimenting with optimizations polynomial3(x, y) = 2.1f + 2.002f*x + 2.003f*y + 2.4f*x*x + 2.5f*x*y + 2.6f*y*y + 2.0007f*x*x*x + 2.0008f*x*x*y + 2.0009f*x*y*y + 2.00011f*y*y*y; //for experimenting with optimizations polynomial4(x, y) = 3.1f + 3.002f*x + 3.003f*y + 3.4f*x*x + 3.5f*x*y + 3.6f*y*y + 3.0007f*x*x*x + 3.0008f*x*x*y + 3.0009f*x*y*y + 3.00011f*y*y*y; //for experimenting with optimizations polynomial5(x, y) = 4.1f + 4.002f*x + 4.003f*y + 4.4f*x*x + 4.5f*x*y + 4.6f*y*y + 
4.0007f*x*x*x + 4.0008f*x*x*y + 4.0009f*x*y*y + 4.00011f*y*y*y; //Kernel #1 Func kernel1; float sigmaX1 = 2.0f; float sigmaY1 = 2.0f; float theta1 = 0.0f; //rotation of sigmaX axis kernel1(x, y) = (exp(-((x*cos(theta1) +y*sin(theta1))*(x*cos(theta1) +y*sin(theta1))) /(2*sigmaX1*sigmaX1)) / (sqrtf(2*M_PI)*sigmaX1)) *(exp(-((y*cos(theta1) - x*sin(theta1))*(y*cos(theta1) - x*sin(theta1))) /(2*sigmaY1*sigmaY1)) / (sqrtf(2*M_PI)*sigmaY1)); //Kernel #2 Func kernel2; float sigmaX2 = 0.5f; float sigmaY2 = 4.0f; float theta2 = 0.0f; //rotation of sigmaX axis kernel2(x, y) = (exp(-((x*cos(theta2) +y*sin(theta2))*(x*cos(theta2) +y*sin(theta2))) /(2*sigmaX2*sigmaX2)) / (sqrtf(2*M_PI)*sigmaX2)) *(exp(-((y*cos(theta2) - x*sin(theta2))*(y*cos(theta2) - x*sin(theta2))) /(2*sigmaY2*sigmaY2)) / (sqrtf(2*M_PI)*sigmaY2)); //Kernel #3 Func kernel3; float sigmaX3 = 0.5f; float sigmaY3 = 4.0f; float theta3 = 3.14159f/4; //rotation of sigmaX axis kernel3(x, y) = (exp(-((x*cos(theta3) +y*sin(theta3))*(x*cos(theta3) +y*sin(theta3))) /(2*sigmaX3*sigmaX3)) / (sqrtf(2*M_PI)*sigmaX3)) *(exp(-((y*cos(theta3) - x*sin(theta3))*(y*cos(theta3) - x*sin(theta3))) /(2*sigmaY3*sigmaY3)) / (sqrtf(2*M_PI)*sigmaY3)); //Kernel #4 Func kernel4; float sigmaX4 = 0.5f; float sigmaY4 = 4.0f; float theta4 = 3.14159f/2; //rotation of sigmaX axis kernel4(x, y) = (exp(-((x*cos(theta4) +y*sin(theta4))*(x*cos(theta4) +y*sin(theta4))) /(2*sigmaX4*sigmaX4)) / (sqrtf(2*M_PI)*sigmaX4)) *(exp(-((y*cos(theta4) - x*sin(theta4))*(y*cos(theta4) - x*sin(theta4))) /(2*sigmaY4*sigmaY4)) / (sqrtf(2*M_PI)*sigmaY4)); //Kernel #5 Func kernel5; float sigmaX5 = 4.0f; float sigmaY5 = 4.0f; float theta5 = 0.0; //rotation of sigmaX axis kernel5(x, y) = (exp(-((x*cos(theta5) +y*sin(theta5))*(x*cos(theta5) +y*sin(theta5))) /(2*sigmaX5*sigmaX5)) / (sqrtf(2*M_PI)*sigmaX5)) *(exp(-((y*cos(theta5) - x*sin(theta5))*(y*cos(theta5) - x*sin(theta5))) /(2*sigmaY5*sigmaY5)) / (sqrtf(2*M_PI)*sigmaY5)); //Compute output image plane Func image_bounded 
("image_bounded"); image_bounded = BoundaryConditions::repeat_edge(image); //Spatially Invariant Implementation 1 /* Expr blur_image_help = 0.0f; Expr norm = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help += image_bounded(x + i, y + j) * (kernel1(i, j) + kernel2(i, j) + kernel3(i, j) + kernel4(i, j) + kernel5(i, j)); norm += (kernel1(i, j) + kernel2(i, j) + kernel3(i, j) + kernel4(i, j) + kernel5(i, j)); } } blur_image_help = blur_image_help/norm; Func blurImage ("blurImage"); blurImage(x, y) = blur_image_help; */ //Spatially Invariant Implementation 2 /* Expr blur_image_help1 = 0.0f; Expr norm1 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help1 += image_bounded(x + i, y + j) * kernel1(i, j); norm1 += kernel1(i, j); } } // blur_image_help1 = blur_image_help1/norm1; Func blurImage1 ("blurImage1"); blurImage1(x, y) = blur_image_help1; Expr blur_image_help2 = 0.0f; Expr norm2 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help2 += image_bounded(x + i, y + j) * kernel2(i, j); norm2 += kernel2(i, j); } } // blur_image_help2 = blur_image_help2/norm2; Func blurImage2 ("blurImage2"); blurImage2(x, y) = blur_image_help2; Expr blur_image_help3 = 0.0f; Expr norm3 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help3 += image_bounded(x + i, y + j) * kernel3(i, j); norm3 += kernel3(i, j); } } // blur_image_help3 = blur_image_help3/norm3; Func blurImage3 ("blurImage3"); blurImage3(x, y) = blur_image_help3; Expr blur_image_help4 = 0.0f; Expr norm4 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help4 += image_bounded(x + i, y + j) * kernel4(i, j); norm4 += kernel4(i, j); } } // blur_image_help4 = blur_image_help4/norm4; 
Func blurImage4 ("blurImage4"); blurImage4(x, y) = blur_image_help4; Expr blur_image_help5 = 0.0f; Expr norm5 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help5 += image_bounded(x + i, y + j) * kernel5(i, j); norm5 += kernel5(i, j); } } // blur_image_help5 = blur_image_help5/norm5; Func blurImage5 ("blurImage5"); blurImage5(x, y) = blur_image_help5; Func blurImage ("blurImage"); // blurImage(x, y) = (blurImage1(x, y) + blurImage2(x, y) + blurImage3(x, y) + // blurImage4(x, y) + blurImage5(x, y))/(5*norm1); blurImage(x, y) = (blur_image_help1 + blur_image_help2 + blur_image_help3 + blur_image_help4 + blur_image_help5)/(5*norm1); */ //Spatially Variant Implementation 1 Expr blur_image_help = 0.0f; Expr norm = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_image_help += image_bounded(x + i, y + j) * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); norm += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); } } blur_image_help = blur_image_help/norm; Func blurImage ("blurImage"); blurImage(x, y) = blur_image_help; //Compute output variance plane Func variance_bounded ("variance_bounded"); variance_bounded = BoundaryConditions::repeat_edge(variance); //compute Variance output Func blurVariance ("blurVariance"); Expr blur_variance_help = 0.0f; Expr vNorm2 = 0.0f; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ blur_variance_help += variance_bounded(x + i, y + j) * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)) *(polynomial1(x, 
y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); vNorm2 += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)) *(polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); } } // blur_variance_help = blur_variance_help/(norm(x,y)*norm(x,y)); blur_variance_help = blur_variance_help/(vNorm2*vNorm2); blurVariance(x, y) = blur_variance_help; //Compute output mask plane Func mask_bounded ("mask_bounded"); mask_bounded = BoundaryConditions::repeat_edge(mask); Func maskOut ("maskOut"); Expr maskOutHelp = 0; for(int i = -boundingBox; i <= boundingBox; i++){ for(int j = -boundingBox; j <= boundingBox; j++){ maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)) == 0.0f, maskOutHelp, maskOutHelp | mask_bounded(x + i, y + j)); // maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j); } } maskOut(x, y) = maskOutHelp; //Schedule // blur.reorder(i_v, x, y); // kernel1.compute_at(blurImage, x); // kernel1.vectorize(x, 8); // kernel1.split(y, y0, yi, 4); // kernel1.parallel(y0); /* kernel1.compute_root(); kernel2.compute_root(); kernel3.compute_root(); kernel4.compute_root(); kernel5.compute_root(); */ //best schedule found: #ifdef TESTING_GPU blurImage.gpu_tile(x, y, 16, 16); // JIT-compile the pipeline for the GPU. CUDA or OpenCL are // not enabled by default. We have to construct a Target // object, enable one of them, and then pass that target // object to compile_jit. Otherwise your CPU will very slowly // pretend it's a GPU, and use one thread per output pixel. 
// Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or CUDA. // We'll enable OpenCL here, because it tends to give better // performance than CUDA, even with NVidia's drivers, because // NVidia's open source LLVM backend doesn't seem to do all // the same optimizations their proprietary compiler does. target.set_feature(Target::OpenCL); // Uncomment the next line and comment out the line above to // try CUDA instead. // target.set_feature(Target::CUDA); // If you want to see all of the OpenCL or CUDA API calls done // by the pipeline, you can also enable the Debug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. // target.set_feature(Target::Debug); blurImage.compile_jit(target); #else blurImage.split(y, y0, yi, 4); blurImage.parallel(y0); blurImage.vectorize(x, 8); #endif // Split the y coordinate of the consumer into strips: blurVariance.split(y, y0, yi, 4); // Compute the strips using a thread pool and a task queue. blurVariance.parallel(y0); // Vectorize across x. blurVariance.vectorize(x, 8); // polynomial1.compute_at(blurImage, x).vectorize(x, 8); // kernel1.compute_at(blurImage, x).vectorize(x, 8); // Split the y coordinate of the consumer into strips of 16 scanlines: maskOut.split(y, y0, yi, 30); // Compute the strips using a thread pool and a task queue. maskOut.parallel(y0); // Vectorize across x by a factor of four. maskOut.vectorize(x, 8); // kernel1.trace_stores(); // blurImage.trace_stores(); //Check out what is happening blurImage.print_loop_nest(); // Print out pseudocode for the pipeline. 
blurImage.compile_to_lowered_stmt("linearCombinationKernelBlurImage.html", {image}, HTML); // blurImage.compile_to_c("linearCombinationKernel_C_Code.cpp", std::vector<Argument>(), "linearCombinationKernel_C_Code"); // blurVariance.compile_to_lowered_stmt("blur.html", {variance}, HTML); // Benchmark the pipeline. #ifdef TESTING_GPU Buffer image_output(Float(32), image.width(), image.height()); //for GPU testing #else Image<float> image_output(image.width(), image.height()); #endif blurImage.realize(image_output); Image<float> variance_output(variance.width(), variance.height()); blurVariance.realize(variance_output); Image<int32_t> mask_output(mask.width(), mask.height()); maskOut.realize(mask_output); #ifdef TESTING_GPU // Run the filter once to initialize any GPU runtime state. blurImage.realize(image_output); // Now take the best of 3 runs for timing. double best_time; for (int i = 0; i < 3; i++) { double t1 = current_time(); // Run the filter 100 times. for (int j = 0; j < 100; j++) { blurImage.realize(image_output); } // Force any GPU code to finish by copying the buffer back to the CPU. 
image_output.copy_to_host(); double t2 = current_time(); double elapsed = (t2 - t1)/100; if (i == 0 || elapsed < best_time) { best_time = elapsed; } } printf("%1.4f milliseconds\n", best_time); #else double average = 0; double min; double max; double imgTime; double varTime; double maskTime; int numberOfRuns = 5; for (int i = 0; i < numberOfRuns; i++) { double t1 = current_time(); blurImage.realize(image_output); double t2 = current_time(); blurVariance.realize(variance_output); double t3 = current_time(); maskOut.realize(mask_output); double t4 = current_time(); double curTime = (t4-t1); average += curTime; if(i == 0){ min = curTime; max = curTime; imgTime = t2-t1; varTime = t3-t2; maskTime = t4-t3; } else{ if(curTime < min){ min = curTime; imgTime = t2-t1; varTime = t3-t2; maskTime = t4-t3; } if(curTime > max) max = curTime; } } average = average/numberOfRuns; std::cout << "Average Time: " << average << ", Min = " << min << ", Max = " << max << ", with " << numberOfRuns << " runs" << '\n'; cout << "For fastest run total time = " << min << ", imgTime = " << imgTime << ", varTime = " << varTime << "maskTime = " << maskTime << endl; #endif #if !defined(STANDALONE) && !defined(TESTING_GPU) //write image out auto imOut = afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>(im.getWidth(), im.getHeight()); for (int y = 0; y < imOut.getHeight(); y++) { afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>::x_iterator inPtr = imOut.x_at(0, y); for (int x = 0; x < imOut.getWidth(); x++){ afwImage::pixel::SinglePixel<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel> curPixel(image_output(x, y), mask_output(x, y), variance_output(x, y)); (*inPtr) = curPixel; inPtr++; } } imOut.writeFits("./halideLinearCombination5x5.fits"); #endif }
// Now a schedule that uses CUDA or OpenCL. void schedule_for_gpu() { // We make the decision about whether to use the GPU for each // Func independently. If you have one Func computed on the // CPU, and the next computed on the GPU, Halide will do the // copy-to-gpu under the hood. For this pipeline, there's no // reason to use the CPU for any of the stages. Halide will // copy the input image to the GPU the first time we run the // pipeline, and leave it there to reuse on subsequent runs. // As before, we'll compute the LUT once at the start of the // pipeline. lut.compute_root(); // Let's compute the look-up-table using the GPU in 16-wide // one-dimensional thread blocks. First we split the index // into blocks of size 16: Var block, thread; lut.split(i, block, thread, 16); // Then we tell cuda that our Vars 'block' and 'thread' // correspond to CUDA's notions of blocks and threads, or // OpenCL's notions of thread groups and threads. lut.gpu_blocks(block) .gpu_threads(thread); // This is a very common scheduling pattern on the GPU, so // there's a shorthand for it: // lut.gpu_tile(i, block, thread, 16); // Func::gpu_tile behaves the same as Func::tile, except that // it also specifies that the tile coordinates correspond to // GPU blocks, and the coordinates within each tile correspond // to GPU threads. // Compute color channels innermost. Promise that there will // be three of them and unroll across them. curved.reorder(c, x, y) .bound(c, 0, 3) .unroll(c); // Compute curved in 2D 8x8 tiles using the GPU. curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8); // This is equivalent to: // curved.tile(x, y, xo, yo, xi, yi, 8, 8) // .gpu_blocks(xo, yo) // .gpu_threads(xi, yi); // We'll leave sharpen as inlined into curved. // Compute the padded input as needed per GPU block, storing // the intermediate result in shared memory. In the schedule // above xo corresponds to GPU blocks. 
padded.compute_at(curved, xo); // Use the GPU threads for the x and y coordinates of the // padded input. padded.gpu_threads(x, y); // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or // Metal are not enabled by default. We have to construct a // Target object, enable one of them, and then pass that // target object to compile_jit. Otherwise your CPU will very // slowly pretend it's a GPU, and use one thread per output // pixel. // Start with a target suitable for the machine you're running // this on. Target target = get_host_target(); // Then enable OpenCL or Metal, depending on which platform // we're on. OS X doesn't update its OpenCL drivers, so they // tend to be broken. CUDA would also be a fine choice on // machines with NVidia GPUs. if (target.os == Target::OSX) { target.set_feature(Target::Metal); } else { target.set_feature(Target::OpenCL); } // Uncomment the next line and comment out the lines above to // try CUDA instead. // target.set_feature(Target::CUDA); // If you want to see all of the OpenCL, Metal, or CUDA API // calls done by the pipeline, you can also enable the Debug // flag. This is helpful for figuring out which stages are // slow, or when CPU -> GPU copies happen. It hurts // performance though, so we'll leave it commented out. // target.set_feature(Target::Debug); curved.compile_jit(target); }