/**
 * Assemble the processing pipeline on top of `raw` and schedule it.
 *
 * The stages are chained in this order: hot_pixel_suppression -> deinterleave
 * -> demosaic -> color_correct -> apply_curve, and the result is written into
 * the file-level Func `processed`, which is also the return value.
 *
 * @param raw          Func feeding the first stage.
 * @param result_type  Forwarded to apply_curve.
 * @param matrix_3200  Forwarded to color_correct.
 * @param matrix_7000  Forwarded to color_correct.
 * @param color_temp   Forwarded to color_correct.
 * @param gamma        Forwarded to apply_curve.
 * @param contrast     Forwarded to apply_curve.
 * @param blackLevel   Forwarded to apply_curve.
 * @param whiteLevel   Forwarded to apply_curve.
 * @return The scheduled `processed` Func.
 */
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp,
             Param<float> gamma, Param<float> contrast,
             Param<int> blackLevel, Param<int> whiteLevel) {
    Var y_sub, x_lane;

    // Algorithm: each stage consumes the previous one.
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast, blackLevel, whiteLevel);

    processed(x, y, c) = curved(x, y, c);

    // Schedule
    Expr width_out = processed.output_buffer().width();
    Expr height_out = processed.output_buffer().height();

    const int rows_per_strip = 32;

    // HVX targets use a fixed lane count; everything else uses the target's
    // natural vector size for 16-bit unsigned values.
    const int lanes = target.has_feature(Target::HVX_64)    ? 32
                      : target.has_feature(Target::HVX_128) ? 64
                      : target.natural_vector_size(UInt(16));
    const int wide_lanes = 2 * lanes;

    denoised.compute_at(processed, yi);
    denoised.store_at(processed, yo);
    denoised.fold_storage(y, 8);
    denoised.vectorize(x, lanes);

    deinterleaved.compute_at(processed, yi);
    deinterleaved.store_at(processed, yo);
    deinterleaved.fold_storage(y, 4);
    deinterleaved.vectorize(x, wide_lanes, TailStrategy::RoundUp);
    deinterleaved.reorder(c, x, y);
    deinterleaved.unroll(c);

    corrected.compute_at(processed, x);
    corrected.vectorize(x, lanes);
    corrected.reorder(c, x, y);
    corrected.unroll(c);

    processed.compute_root();
    processed.split(y, yo, yi, rows_per_strip);
    processed.split(yi, yi, y_sub, 2);
    processed.split(x, x, x_lane, wide_lanes, TailStrategy::RoundUp);
    processed.reorder(x_lane, c, y_sub, x, yi, yo);
    processed.vectorize(x_lane, wide_lanes);
    processed.parallel(yo);

    const bool on_hvx = target.features_any_of({Target::HVX_64, Target::HVX_128});
    if (on_hvx) {
        processed.hexagon();
        denoised.align_storage(x, lanes);
        deinterleaved.align_storage(x, lanes);
        corrected.align_storage(x, lanes);
    }

    // We can generate slightly better code if we know the splits divide the extent.
    processed.bound(c, 0, 3);
    processed.bound(x, 0, ((width_out) / (wide_lanes)) * (wide_lanes));
    processed.bound(y, 0, (height_out / rows_per_strip) * rows_per_strip);

    return processed;
}
// Vectorize `f` along the file-level `x`: 32 lanes offloaded to Hexagon when
// the target has either HVX feature, 16 lanes on the host otherwise.
void schedule(Func f, const Target &t) {
    // TODO: Add GPU schedule where supported.
    const bool has_hvx = t.features_any_of({Target::HVX_64, Target::HVX_128});
    if (!has_hvx) {
        f.vectorize(x, 16);
        return;
    }
    f.hexagon().vectorize(x, 32);
}
int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_feature(Target::Profile)) { // The profiler adds lots of extra prints, so counting the // number of prints is not useful. printf("Skipping test because profiler is active\n"); return 0; } if (target.has_feature(Target::Debug)) { // Same thing here: the runtime debug adds lots of extra prints, // so counting the number of prints is not useful. printf("Skipping test because runtime debug is active\n"); return 0; } Var x; { Func f; f(x) = print(x * x, "the answer is", 42.0f, "unsigned", cast<uint32_t>(145)); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 10); for (size_t i = 0; i < messages.size(); i++) { long square; float forty_two; unsigned long one_forty_five; int scan_count = sscanf(messages[i].c_str(), "%ld the answer is %f unsigned %lu", &square, &forty_two, &one_forty_five); assert(scan_count == 3); assert(square == static_cast<long long>(i * i)); assert(forty_two == 42.0f); assert(one_forty_five == 145); } } messages.clear(); { Func f; Param<int> param; param.set(127); // Test a string containing a printf format specifier (It should print it as-is). f(x) = print_when(x == 3, x * x, "g", 42.0f, "%s", param); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 1); long nine; float forty_two; long p; int scan_count = sscanf(messages[0].c_str(), "%ld g %f %%s %ld", &nine, &forty_two, &p); assert(scan_count == 3); assert(nine == 9); assert(forty_two == 42.0f); assert(p == 127); } messages.clear(); { Func f; // Test a single message longer than 8K. 
std::vector<Expr> args; for (int i = 0; i < 500; i++) { uint64_t n = i; n *= n; n *= n; n *= n; n *= n; n += 100; int32_t hi = n >> 32; int32_t lo = n & 0xffffffff; args.push_back((cast<uint64_t>(hi) << 32) | lo); Expr dn = cast<double>((float)(n)); args.push_back(dn); } f(x) = print(args); f.set_custom_print(halide_print); Buffer<uint64_t> result = f.realize(1); if (result(0) != 100) { return -1; } assert(messages.back().size() == 8191); } messages.clear(); // Check that Halide's stringification of floats and doubles // matches %f and %e respectively. #ifndef _WIN32 // msvc's library has different ideas about how %f and %e should come out. { Func f, g; const int N = 1000000; Expr e = reinterpret(Float(32), random_uint()); // Make sure we cover some special values. e = select(x == 0, 0.0f, x == 1, -0.0f, x == 2, std::numeric_limits<float>::infinity(), x == 3, -std::numeric_limits<float>::infinity(), x == 4, std::numeric_limits<float>::quiet_NaN(), x == 5, -std::numeric_limits<float>::quiet_NaN(), e); e = select(x == 5, std::numeric_limits<float>::denorm_min(), x == 6, -std::numeric_limits<float>::denorm_min(), x == 7, std::numeric_limits<float>::min(), x == 8, -std::numeric_limits<float>::min(), x == 9, std::numeric_limits<float>::max(), x == 10, -std::numeric_limits<float>::max(), x == 11, 1.0f - 1.0f / (1 << 22), e); f(x) = print(e); f.set_custom_print(halide_print); Buffer<float> imf = f.realize(N); assert(messages.size() == (size_t)N); char correct[1024]; for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%f\n", imf(i)); // Some versions of the std library can emit some NaN patterns // as "-nan", due to sloppy conversion (or not) of the sign bit. // Halide considers all NaN's equivalent, so paper over this // noise in the test by normalizing all -nan -> nan. 
if (messages[i] == "-nan\n") messages[i] = "nan\n"; if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n"); if (messages[i] != correct) { printf("float %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, imf(i)); return -1; } } messages.clear(); g(x) = print(reinterpret(Float(64), (cast<uint64_t>(random_uint()) << 32) | random_uint())); g.set_custom_print(halide_print); Buffer<double> img = g.realize(N); assert(messages.size() == (size_t)N); for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%e\n", img(i)); // Some versions of the std library can emit some NaN patterns // as "-nan", due to sloppy conversion (or not) of the sign bit. // Halide considers all NaN's equivalent, so paper over this // noise in the test by normalizing all -nan -> nan. if (messages[i] == "-nan\n") messages[i] = "nan\n"; if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n"); if (messages[i] != correct) { printf("double %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, img(i)); return -1; } } } #endif messages.clear(); { Func f; // Test a vectorized print. f(x) = print(x * 3); f.set_custom_print(halide_print); f.vectorize(x, 32); if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(); } Buffer<int> result = f.realize(128); if (!target.features_any_of({Target::HVX_64, Target::HVX_128})) { assert((int)messages.size() == result.width()); for (size_t i = 0; i < messages.size(); i++) { assert(messages[i] == std::to_string(i * 3) + "\n"); } } else { // The Hexagon simulator prints directly to stderr, so we // can't read the messages. } } messages.clear(); { Func f; // Test a vectorized print_when. 
f(x) = print_when(x % 2 == 0, x * 3); f.set_custom_print(halide_print); f.vectorize(x, 32); if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(); } Buffer<int> result = f.realize(128); if (!target.features_any_of({Target::HVX_64, Target::HVX_128})) { assert((int)messages.size() == result.width() / 2); for (size_t i = 0; i < messages.size(); i++) { assert(messages[i] == std::to_string(i * 2 * 3) + "\n"); } } else { // The Hexagon simulator prints directly to stderr, so we // can't read the messages. } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (1) { // Test a tuple reduction on the gpu Func f; Var x, y; f(x, y) = Tuple(x + y, x - y); // Updates to a reduction are atomic. f(x, y) = Tuple(f(x, y)[1]*2, f(x, y)[0]*2); // now equals ((x - y)*2, (x + y)*2) if (target.has_gpu_feature()) { f.gpu_tile(x, y, 16, 16); f.update().gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(y).vectorize(x, 32); f.update().hexagon(y).vectorize(x, 32); } Realization result = f.realize(1024, 1024); Image<int> a = result[0], b = result[1]; for (int y = 0; y < a.height(); y++) { for (int x = 0; x < a.width(); x++) { int correct_a = (x - y)*2; int correct_b = (x + y)*2; if (a(x, y) != correct_a || b(x, y) != correct_b) { printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n", x, y, a(x, y), b(x, y), correct_a, correct_b); return -1; } } } } if (1) { // Now test one that alternates between cpu and gpu per update step Func f; Var x, y; f(x, y) = Tuple(x + y, x - y); for (size_t i = 0; i < 10; i++) { // Swap the tuple elements and increment both f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1); } // Schedule the pure step and the odd update steps on the gpu if (target.has_gpu_feature()) { f.gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(y).vectorize(x, 32); } for (int i = 0; i < 10; i ++) { if (i & 1) { if (target.has_gpu_feature()) { f.update(i).gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.update(i).hexagon(y).vectorize(x, 32); } } else { f.update(i); } } Realization result = f.realize(1024, 1024); Image<int> a = result[0], b = result[1]; for (int y = 0; y < a.height(); y++) { for (int x = 0; x < a.width(); x++) { int correct_a = (x + y) + 10; int correct_b = (x - y) + 10; if (a(x, y) != correct_a || b(x, y) != correct_b) { printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n", 
x, y, a(x, y), b(x, y), correct_a, correct_b); return -1; } } } } if (1) { // Same as above, but switches which steps are gpu and cpu Func f; Var x, y; f(x, y) = Tuple(x + y, x - y); for (size_t i = 0; i < 10; i++) { // Swap the tuple elements and increment both f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1); } // Schedule the even update steps on the gpu for (int i = 0; i < 10; i ++) { if (i & 1) { if (target.has_gpu_feature()) { f.update(i).gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.update(i).hexagon(y).vectorize(x, 32); } } else { f.update(i); } } Realization result = f.realize(1024, 1024); Image<int> a = result[0], b = result[1]; for (int y = 0; y < a.height(); y++) { for (int x = 0; x < a.width(); x++) { int correct_a = (x + y) + 10; int correct_b = (x - y) + 10; if (a(x, y) != correct_a || b(x, y) != correct_b) { printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n", x, y, a(x, y), b(x, y), correct_a, correct_b); return -1; } } } } if (1) { // In this one, each step only uses one of the tuple elements // of the previous step, so only that buffer should get copied // back to host or copied to device. 
Func f; Var x, y; f(x, y) = Tuple(x + y - 1000, x - y + 1000); for (size_t i = 0; i < 10; i++) { f(x, y) = Tuple(f(x, y)[1] - 1, f(x, y)[1] + 1); } // Schedule the even update steps on the gpu for (int i = 0; i < 10; i++) { if (i & 1) { f.update(i); } else { if (target.has_gpu_feature()) { f.update(i).gpu_tile(x, y, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.update(i).hexagon(y).vectorize(x, 32); } } } Realization result = f.realize(1024, 1024); Image<int> a = result[0], b = result[1]; for (int y = 0; y < a.height(); y++) { for (int x = 0; x < a.width(); x++) { int correct_a = (x - y + 1000) + 8; int correct_b = (x - y + 1000) + 10; if (a(x, y) != correct_a || b(x, y) != correct_b) { printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n", x, y, a(x, y), b(x, y), correct_a, correct_b); return -1; } } } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Buffer<uint8_t> input(128, 64); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { input(x, y) = y*input.width() + x; } } Var x, y, xi, yi; { Func f; f(x, y) = select(((input(x, y) > 10) && (input(x, y) < 20)) || ((input(x, y) > 40) && (!(input(x, y) > 50))), u8(255), u8(0)); Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon().vectorize(x, 128); } else { f.vectorize(x, 8); } Buffer<uint8_t> output = f.realize(input.width(), input.height(), target); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { bool cond = ((input(x, y) > 10) && (input(x, y) < 20)) || ((input(x, y) > 40) && (!(input(x, y) > 50))); uint8_t correct = cond ? 255 : 0; if (correct != output(x, y)) { fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct); return -1; } } } } // Test a condition that uses a let resulting from common // subexpression elimination. { Func f; Expr common_cond = input(x, y) > 10; f(x, y) = select((common_cond && (input(x, y) < 20)) || ((input(x, y) > 40) && (!common_cond)), u8(255), u8(0)); Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon().vectorize(x, 128); } else { f.vectorize(x, 8); } Buffer<uint8_t> output = f.realize(input.width(), input.height(), target); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { bool common_cond = input(x, y) > 10; bool cond = (common_cond && (input(x, y) < 20)) || ((input(x, y) > 40) && (!common_cond)); uint8_t correct = cond ? 
255 : 0; if (correct != output(x, y)) { fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct); return -1; } } } } // Test a condition which has vector and scalar inputs. { Func f("f"); f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y)); Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon().vectorize(x, 128); } else { f.vectorize(x, 128); } Buffer<uint8_t> output = f.realize(input.width(), input.height(), target); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { bool cond = x < 10 || x > 20 || y < 10 || y > 20; uint8_t correct = cond ? 0 : input(x,y); if (correct != output(x, y)) { fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct); return -1; } } } } // Test a condition that uses differently sized types. { Func f; Expr ten = 10; f(x, y) = select(input(x, y) > ten, u8(255), u8(0)); Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon().vectorize(x, 128); } else { f.vectorize(x, 8); } Buffer<uint8_t> output = f.realize(input.width(), input.height(), target); for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { bool cond = input(x, y) > 10; uint8_t correct = cond ? 255 : 0; if (correct != output(x, y)) { fprintf(stderr, "output(%d, %d) = %d instead of %d\n", x, y, output(x, y), correct); return -1; } } } } // Test a select where the condition has a different width than // the true/false values. 
for (int w = 8; w <= 32; w *= 2) { for (int n = 8; n < w; n *= 2) { Type narrow = UInt(n), wide = UInt(w); Func in_wide; in_wide(x, y) = cast(wide, y + x*3); in_wide.compute_root(); Func in_narrow; in_narrow(x, y) = cast(narrow, x*y + x - 17); in_narrow.compute_root(); Func f; f(x, y) = select(in_narrow(x, y) > 10, in_wide(x, y*2), in_wide(x, y*2+1)); Func cpu; cpu(x, y) = f(x, y); Func gpu; gpu(x, y) = f(x, y); Func out; out(x, y) = {cast<uint32_t>(cpu(x, y)), cast<uint32_t>(gpu(x, y))}; cpu.compute_root(); gpu.compute_root(); Target target = get_jit_target_from_environment(); if (target.has_feature(Target::OpenCL) && n == 16 && w == 32) { // Workaround for https://github.com/halide/Halide/issues/2477 printf("Skipping uint%d -> uint%d for OpenCL\n", n, w); continue; } if (target.has_gpu_feature()) { gpu.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { gpu.hexagon().vectorize(x, 128); } else { // Just test vectorization gpu.vectorize(x, 8); } Realization r = out.realize(input.width(), input.height(), target); Buffer<uint32_t> cpu_output = r[0]; Buffer<uint32_t> gpu_output = r[1]; for (int y = 0; y < input.height(); y++) { for (int x = 0; x < input.width(); x++) { if (cpu_output(x, y) != gpu_output(x, y)) { fprintf(stderr, "gpu_output(%d, %d) = %d instead of %d for uint%d -> uint%d\n", x, y, gpu_output(x, y), cpu_output(x, y), n, w); return -1; } } } } } printf("Success!\n"); return 0; }
int main(int arch, char **argv) { const int W = 256, H = 256; Buffer<uint8_t> in(W, H); // Set up the input. for (int y = 0; y < H; y++) { for (int x = 0; x < W; x++) { in(x, y) = rand() & 0xff; } } // Define a convolution kernel, and its sum. Buffer<int8_t> kernel(3, 3); kernel.set_min(-1, -1); for (int y = -1; y <= 1; y++) { for (int x = -1; x <= 1; x++) { kernel(x, y) = rand() % 8 - 4; } } Var x("x"), y("y"), xi("xi"), yi("yi"); RDom r(-1, 3, -1, 3); // Boundary condition. Func input = BoundaryConditions::repeat_edge(in); input.compute_root(); // Test a widening reduction, followed by a narrowing. { Func f; f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16); // Schedule. Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { f.gpu_tile(x, y, xi, yi, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon().vectorize(x, 128); } else { f.vectorize(x, target.natural_vector_size<uint8_t>()); } // Run the pipeline and verify the results are correct. Buffer<uint8_t> out = f.realize(W, H, target); for (int y = 1; y < H-1; y++) { for (int x = 1; x < W-1; x++) { int16_t correct = 0; for (int ry = -1; ry <= 1; ry++) { for (int rx = -1; rx <= 1; rx++) { correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry); } } correct = std::min(std::max(correct / 16, 0), 255); if (correct != out(x, y)) { std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n"; return -1; } } } } // Test a tuple reduction with widening, followed by narrowing the result. { Func f; f(x, y) = { i16(0), i8(0) }; f(x, y) = { f(x, y)[0] + i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y), f(x, y)[1] + kernel(r.x, r.y), }; Func g; g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16); // Schedule. 
Target target = get_jit_target_from_environment(); if (target.has_gpu_feature()) { g.gpu_tile(x, y, xi, yi, 16, 16); } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { g.hexagon().vectorize(x, 128); } else { g.vectorize(x, target.natural_vector_size<uint8_t>()); } // Run the pipeline and verify the results are correct. Buffer<uint8_t> out = g.realize(W, H, target); for (int y = 1; y < H-1; y++) { for (int x = 1; x < W-1; x++) { int16_t correct = 0; for (int ry = -1; ry <= 1; ry++) { for (int rx = -1; rx <= 1; rx++) { correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry); correct += kernel(rx, ry); } } correct = std::min(std::max(correct / 16, 0), 255); if (correct != out(x, y)) { std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y) << " instead of " << correct << "\n"; return -1; } } } } std::cout << "Success!\n"; return 0; }