int main(int argc, char **argv) { Var x("x"); Func f("f"); Param<float> u; f(x) = u; Target target = get_target_from_environment(); if (target.features & Target::CUDA) { f.cuda_tile(x, 256); } u.set(17.0f); Image<float> out_17 = f.realize(1024); u.set(123.0f); Image<float> out_123 = f.realize(1024); for (int i = 0; i < 1024; i++) { if (out_17(i) != 17.0f || out_123(i) != 123.0f) { printf("Failed!\n"); for (int i = 0; i < 1024; i++) { printf("%f %f\n", out_17(i), out_123(i)); } return -1; } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Var x("x"); Func f("f"); Param<float> u; f(x) = u; std::string target = get_target(); if (target == "ptx" || target == "ptx-debug") { f.cuda_tile(x, 256); } u.set(17.0f); Image<float> out_17 = f.realize(1024); u.set(123.0f); Image<float> out_123 = f.realize(1024); for (int i = 0; i < 1024; i++) { if (out_17(i) != 17.0f || out_123(i) != 123.0f) { printf("Failed!\n"); for (int i = 0; i < 1024; i++) { printf("%f %f\n", out_17(i), out_123(i)); } return -1; } } printf("Success!\n"); return 0; }
int simple_rfactor_with_specialize_test(bool compile_module) { Func f("f"), g("g"); Var x("x"), y("y"); f(x, y) = x + y; f.compute_root(); g(x, y) = 40; RDom r(10, 20, 30, 40); g(r.x, r.y) = min(f(r.x, r.y) + 2, g(r.x, r.y)); Param<int> p; Var u("u"); Func intm = g.update(0).specialize(p >= 10).rfactor(r.y, u); intm.compute_root(); intm.vectorize(u, 8); intm.update(0).vectorize(r.x, 2); if (compile_module) { p.set(20); // Check the call graphs. Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls checker; m.functions().front().body.accept(&checker); CallGraphs expected = { {g.name(), {}}, {g.update(0).name(), {f.name(), intm.name(), g.name()}}, {intm.name(), {}}, {intm.update(0).name(), {f.name(), intm.name()}}, {f.name(), {}}, }; if (check_call_graphs(checker.calls, expected) != 0) { return -1; } } else { { p.set(0); Image<int> im = g.realize(80, 80); auto func = [](int x, int y, int z) { return (10 <= x && x <= 29) && (30 <= y && y <= 69) ? std::min(x + y + 2, 40) : 40; }; if (check_image(im, func)) { return -1; } } { p.set(20); Image<int> im = g.realize(80, 80); auto func = [](int x, int y, int z) { return (10 <= x && x <= 29) && (30 <= y && y <= 69) ? std::min(x + y + 2, 40) : 40; }; if (check_image(im, func)) { return -1; } } } return 0; }
int tuple_memoize_test(bool toggle_val, int index) { buffer_index = index; Param<bool> toggle; Func f1("f1_" + std::to_string(index)), f2("f2_" + std::to_string(index)); Var x; f1(x) = Tuple(2*x, 2*x); f2(x) = Tuple(select(toggle, f1(x)[0], 1), select(toggle, f1(x)[1], 1)); f1.compute_root().memoize(); f2.set_custom_trace(&single_toggle_trace); f1.trace_stores(); f2.compile_jit(); set_toggle1 = toggle_val; toggle.set(set_toggle1); Realization out = f2.realize(128); Image<int> out0 = out[0]; Image<int> out1 = out[1]; if (check_correctness_single(out0, set_toggle1) != 0) { return -1; } if (check_correctness_single(out1, set_toggle1) != 0) { return -1; } return 0; }
int main(int argc, char **argv) { ImageParam im1(UInt(8), 1); Buffer<uint8_t> im2(10), im3(20); Param<int> j; assert(im1.dimensions() == 1); assert(im2.dimensions() == 1); assert(im3.dimensions() == 1); Func f; Var x; f(x) = x + im1.width(); RDom r(0, clamp(im2(j), 0, 99)); f(r) = 37; im2(3) = 10; j.set(3); im1.set(im3); Buffer<int> result = f.realize(100); for (int i = 0; i < 100; i++) { int correct = i < im2(3) ? 37 : (i+20); if (result(i) != correct) { printf("result(%d) = %d instead of %d\n", i, result(i), correct); return -1; } } printf("Success!\n"); return 0; }
int non_trivial_allocate_predicate_test(bool toggle_val, int index) { buffer_index = index; Param<bool> toggle; Func f1("f1_" + std::to_string(index)), f2("f2_" + std::to_string(index)); Func f3("f3_" + std::to_string(index)); Var x; // Generate allocate f1[...] if toggle f1(x) = 2*x; f2(x) = select(toggle, f1(x), 1); f3(x) = select(toggle, f2(x), 1); f1.compute_root().memoize(); f2.compute_root().memoize(); f3.set_custom_trace(&double_toggle_trace); f1.trace_stores(); f2.trace_stores(); f3.compile_jit(); set_toggle1 = toggle_val; set_toggle2 = toggle_val; toggle.set(set_toggle1); Image<int> out = f3.realize(10); if (check_correctness_single(out, set_toggle1) != 0) { return -1; } return 0; }
int single_memoize_test(bool toggle_val, int index) { buffer_index = index; Param<bool> toggle; Func f1("f1_" + std::to_string(index)), f2("f2_" + std::to_string(index)); Var x; f1(x) = 2*x; f2(x) = select(toggle, f1(x), 1); f1.compute_root().memoize(); f2.set_custom_trace(&single_toggle_trace); f1.trace_stores(); f2.compile_jit(); set_toggle1 = toggle_val; toggle.set(set_toggle1); Image<int> out = f2.realize(10); if (check_correctness_single(out, set_toggle1) != 0) { return -1; } return 0; }
// Stores three compute_root producers in the requested memory types, realizes
// a consumer with a custom allocator installed, and verifies that the number
// of allocator calls (the global `mallocs`) equals the number of producers
// whose requested type is MemoryType::Heap.
// On a mismatch, reports to std::cerr and terminates the process with
// exit(-1).
void check(MemoryType t1, MemoryType t2, MemoryType t3) {
    Var xv;

    // By default, small constant-sized allocations, or allocations that can be
    // bounded with a small constant size, go on the stack. Other allocations
    // go on the heap.
    Func f1, f2, f3;
    f1(xv) = xv;
    f1.compute_root().store_in(t1);
    f2(xv) = xv;
    f2.compute_root().store_in(t2);
    f3(xv) = xv;
    f3.compute_root().store_in(t3);

    Func consumer;
    Param<bool> pick;
    consumer(xv) = (f1(0) + f1(1)) + f2(select(pick, 0, 2)) + f2(0) + f3(xv % 1000);
    pick.set(true);

    // One allocator call is expected per producer placed on the heap.
    const MemoryType requested[] = {t1, t2, t3};
    int expected_mallocs = 0;
    for (int n = 0; n < 3; n++) {
        if (requested[n] == MemoryType::Heap) {
            expected_mallocs++;
        }
    }

    mallocs = 0;
    consumer.set_custom_allocator(my_malloc, my_free);
    consumer.realize(1024);

    if (mallocs == expected_mallocs) {
        return;
    }

    std::cerr << "Wrong number of mallocs for " << t1 << ", " << t2 << ", " << t3 << "\n"
              << "Expected " << expected_mallocs << " got " << mallocs << "\n";
    exit(-1);
}
int main(int argc, char **argv) { Var x, y, z; Func f; Param<int> k; k.set(3); f(x, y, z) = x*y+z*k+1; f.parallel(x); f.parallel(y); f.parallel(z); Image<int> im = f.realize(64, 64, 64); for (int x = 0; x < 64; x++) { for (int y = 0; y < 64; y++) { for (int z = 0; z < 64; z++) { if (im(x, y, z) != x*y+z*3+1) { printf("im(%d, %d, %d) = %d\n", x, y, z, im(x, y, z)); return -1; } } } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { // Define a pipeline that dumps some squares to a file using an // external consumer stage. Func source; Var x; source(x) = x*x; Param<int> min, extent; Param<const char *> filename; Func sink; std::vector<ExternFuncArgument> args; args.push_back(source); args.push_back(filename); args.push_back(min); args.push_back(extent); sink.define_extern("dump_to_file", args, Int(32), 0); source.compute_root(); sink.compile_jit(); // Dump the first 10 squares to a file filename.set("halide_test_extern_consumer.txt"); min.set(0); extent.set(10); sink.realize(); if (!check_result()) return -1; // Test ImageParam ExternFuncArgument via passed in image. Image<int32_t> buf = source.realize(10); ImageParam passed_in(Int(32), 1); passed_in.set(buf); Func sink2; std::vector<ExternFuncArgument> args2; args2.push_back(passed_in); args2.push_back(filename); args2.push_back(min); args2.push_back(extent); sink2.define_extern("dump_to_file", args2, Int(32), 0); sink2.realize(); if (!check_result()) return -1; printf("Success!\n"); return 0; }
// Default constructor: constructs the six members m0..m5 with the names
// "m0".."m5" and then sets each one from the corresponding element
// m[0]..m[5].
// NOTE(review): `m` is not declared in this block; presumably it is a member
// array declared elsewhere in the class. Confirm it holds valid values before
// this constructor body reads it.
CoordXform() : m0("m0"), m1("m1"), m2("m2"), m3("m3"), m4("m4"), m5("m5") { m0.set(m[0]); m1.set(m[1]); m2.set(m[2]); m3.set(m[3]); m4.set(m[4]); m5.set(m[5]); }
int main(int argc, char **argv) { Param<float> val; Func f, g; Var x, y; f(x, y) = val + cast<uint8_t>(x); g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); g.split(y, y, _, 16); f.store_root(); f.compute_at(g, y).memoize(); val.set(23.0f); Image<uint8_t> out = g.realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); } } }
int main(int argc, char **argv) { Var x, y, z; Func f, g; Param<int> k; k.set(3); f(x, y, z) = x*y+z*k+1; g(x, y, z) = f(x, y, z) + 2; f.parallel(x); f.parallel(y); g.parallel(z); f.compute_at(g, z); auto target = get_jit_target_from_environment(); if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { g.hexagon().vectorize(x, 32); f.vectorize(x, 32); } Buffer<int> im = g.realize(64, 64, 64); for (int x = 0; x < 64; x++) { for (int y = 0; y < 64; y++) { for (int z = 0; z < 64; z++) { if (im(x, y, z) != x*y+z*3+3) { printf("im(%d, %d, %d) = %d\n", x, y, z, im(x, y, z)); return -1; } } } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Var x; Func f; Param<int> k; k.set(3); f(x) = x*k; f.parallel(x); Buffer<int> im = f.realize(16); for (int i = 0; i < 16; i++) { if (im(i) != i*3) { printf("im(%d) = %d\n", i, im(i)); return -1; } } printf("Success!\n"); return 0; }
static int run(int argc, char **argv) { static const MeCab::Option long_options[] = { { "dicdir", 'd', ".", "DIR", "set DIR as dicdir(default \".\" )" }, { "outdir", 'o', ".", "DIR", "set DIR as output dir" }, { "model", 'm', 0, "FILE", "use FILE as model file" }, { "version", 'v', 0, 0, "show the version and exit" }, { "training-algorithm", 'a', "crf", "(crf|hmm)", "set training algorithm" }, { "default-emission-cost", 'E', "4000", "INT", "set default emission cost for HMM" }, { "default-transition-cost", 'T', "4000", "INT", "set default transition cost for HMM" }, { "help", 'h', 0, 0, "show this help and exit." }, { 0, 0, 0, 0 } }; Param param; if (!param.open(argc, argv, long_options)) { std::cout << param.what() << "\n\n" << COPYRIGHT << "\ntry '--help' for more information." << std::endl; return -1; } if (!param.help_version()) return 0; ContextID cid; DecoderFeatureIndex fi; DictionaryRewriter rewrite; const std::string dicdir = param.get<std::string>("dicdir"); const std::string outdir = param.get<std::string>("outdir"); const std::string model = param.get<std::string>("model"); #define DCONF(file) create_filename(dicdir, std::string(file)).c_str() #define OCONF(file) create_filename(outdir, std::string(file)).c_str() CHECK_DIE(param.load(DCONF(DICRC))) << "no such file or directory: " << DCONF(DICRC); std::string charset; { Dictionary dic; CHECK_DIE(dic.open(DCONF(SYS_DIC_FILE), "r")); charset = dic.charset(); CHECK_DIE(!charset.empty()); } int default_emission_cost = 0; int default_transition_cost = 0; std::string type = param.get<std::string>("training-algorithm"); toLower(&type); if (type == "hmm") { default_emission_cost = param.get<int>("default-emission-cost"); default_transition_cost = param.get<int>("default-transition-cost"); CHECK_DIE(default_transition_cost > 0) << "default transition cost must be > 0"; CHECK_DIE(default_emission_cost > 0) << "default transition cost must be > 0"; param.set("identity-template", 1); } CharProperty property; 
CHECK_DIE(property.open(param)); property.set_charset(charset.c_str()); const std::string bos = param.get<std::string>("bos-feature"); const int factor = param.get<int>("cost-factor"); std::vector<std::string> dic; enum_csv_dictionaries(dicdir.c_str(), &dic); { CHECK_DIE(dicdir != outdir) << "output directory = dictionary directory! " "Please specify different directory."; CHECK_DIE(!outdir.empty()) << "output directory is empty"; CHECK_DIE(!model.empty()) << "model file is empty"; CHECK_DIE(fi.open(param)) << fi.what(); CHECK_DIE(factor > 0) << "cost factor needs to be positive value"; CHECK_DIE(!bos.empty()) << "bos-feature is empty"; CHECK_DIE(dic.size()) << "no dictionary is found in " << dicdir; CHECK_DIE(rewrite.open(DCONF(REWRITE_FILE))); } gencid_bos(bos, &rewrite, &cid); gencid(DCONF(UNK_DEF_FILE), &rewrite, &cid); for (std::vector<std::string>::const_iterator it = dic.begin(); it != dic.end(); ++it) { gencid(it->c_str(), &rewrite, &cid); } std::cout << "emitting " << OCONF(LEFT_ID_FILE) << "/ " << OCONF(RIGHT_ID_FILE) << std::endl; cid.build(); cid.save(OCONF(LEFT_ID_FILE), OCONF(RIGHT_ID_FILE)); gendic(DCONF(UNK_DEF_FILE), OCONF(UNK_DEF_FILE), property, &rewrite, cid, &fi, true, factor, default_emission_cost); for (std::vector<std::string>::const_iterator it = dic.begin(); it != dic.end(); ++it) { std::string file = *it; remove_pathname(&file); gendic(it->c_str(), OCONF(file.c_str()), property, &rewrite, cid, &fi, false, factor, default_emission_cost); } genmatrix(OCONF(MATRIX_DEF_FILE), cid, &fi, factor, default_transition_cost); copy(DCONF(CHAR_PROPERTY_DEF_FILE), OCONF(CHAR_PROPERTY_DEF_FILE)); copy(DCONF(REWRITE_FILE), OCONF(REWRITE_FILE)); copy(DCONF(DICRC), OCONF(DICRC)); if (type == "crf") copy(DCONF(FEATURE_FILE), OCONF(FEATURE_FILE)); #undef OCONF #undef DCONF std::cout << "\ndone!\n"; return 0; }
int tuple_specialize_rdom_predicate_rfactor_test(bool compile_module) { Func f("f"), g("g"); Var x("x"), y("y"), z("z"); f(x, y, z) = Tuple(x + y + z, x - y + z); f.compute_root(); RDom r(5, 20, 5, 20, 5, 20); r.where(r.x*r.x + r.z*r.z <= 200); r.where(r.y*r.z + r.z*r.z > 100); Func ref("ref"); ref(x, y) = Tuple(1, 3); ref(x, y) = Tuple(ref(x, y)[0]*f(r.x, r.y, r.z)[0], ref(x, y)[1] + 2*f(r.x, r.y, r.z)[1]); Realization ref_rn = ref.realize(10, 10); g(x, y) = Tuple(1, 3); g(x, y) = Tuple(g(x, y)[0]*f(r.x, r.y, r.z)[0], g(x, y)[1] + 2*f(r.x, r.y, r.z)[1]); Param<int> p; Param<bool> q; Var u("u"), v("v"), w("w"); Func intm1 = g.update(0).specialize(p >= 5).rfactor({{r.y, v}, {r.z, w}}); intm1.update(0).parallel(v, 2); RVar rxi("rxi"), rxo("rxo"); intm1.update(0).split(r.x, rxo, rxi, 2); Var t("t"); Func intm2 = intm1.update(0).specialize(q).rfactor(rxi, t); Func intm3 = intm1.update(0).specialize(!q).rfactor(rxo, t); Func intm4 = g.update(0).rfactor({{r.x, u}, {r.z, w}}); intm4.update(0).vectorize(u); if (compile_module) { // Check the call graphs. 
Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls checker; m.functions().front().body.accept(&checker); CallGraphs expected = { {g.name(), {}}, {g.update(0).name(), {intm1.name() + ".0", intm1.name() + ".1", intm4.name() + ".0", intm4.name() + ".1", g.name() + ".0", g.name() + ".1"}}, {intm1.name(), {}}, {intm1.update(0).name(), {intm2.name() + ".0", intm2.name() + ".1", intm3.name() + ".0", intm3.name() + ".1", intm1.name() + ".0", intm1.name() + ".1"}}, {intm2.name(), {}}, {intm2.update(0).name(), {f.name() + ".0", f.name() + ".1", intm2.name() + ".0", intm2.name() + ".1"}}, {intm3.name(), {}}, {intm3.update(0).name(), {f.name() + ".0", f.name() + ".1", intm3.name() + ".0", intm3.name() + ".1"}}, {intm4.name(), {}}, {intm4.update(0).name(), {f.name() + ".0", f.name() + ".1", intm4.name() + ".0", intm4.name() + ".1"}}, {f.name(), {}}, }; if (check_call_graphs(checker.calls, expected) != 0) { return -1; } } else { { p.set(10); q.set(true); Realization rn = g.realize(10, 10); Image<int> im1(rn[0]); Image<int> im2(rn[1]); Image<int> ref_im1(ref_rn[0]); Image<int> ref_im2(ref_rn[1]); auto func1 = [&ref_im1](int x, int y, int z) { return ref_im1(x, y, z); }; if (check_image(im1, func1)) { return -1; } auto func2 = [&ref_im2](int x, int y, int z) { return ref_im2(x, y, z); }; if (check_image(im2, func2)) { return -1; } } { p.set(10); q.set(false); Realization rn = g.realize(10, 10); Image<int> im1(rn[0]); Image<int> im2(rn[1]); Image<int> ref_im1(ref_rn[0]); Image<int> ref_im2(ref_rn[1]); auto func1 = [&ref_im1](int x, int y, int z) { return ref_im1(x, y, z); }; if (check_image(im1, func1)) { return -1; } auto func2 = [&ref_im2](int x, int y, int z) { return ref_im2(x, y, z); }; if (check_image(im2, func2)) { return -1; } } { p.set(0); q.set(true); Realization rn = g.realize(10, 10); Image<int> im1(rn[0]); Image<int> im2(rn[1]); Image<int> ref_im1(ref_rn[0]); Image<int> ref_im2(ref_rn[1]); auto func1 = [&ref_im1](int x, int y, int z) { return ref_im1(x, 
y, z); }; if (check_image(im1, func1)) { return -1; } auto func2 = [&ref_im2](int x, int y, int z) { return ref_im2(x, y, z); }; if (check_image(im2, func2)) { return -1; } } { p.set(0); q.set(false); Realization rn = g.realize(10, 10); Image<int> im1(rn[0]); Image<int> im2(rn[1]); Image<int> ref_im1(ref_rn[0]); Image<int> ref_im2(ref_rn[1]); auto func1 = [&ref_im1](int x, int y, int z) { return ref_im1(x, y, z); }; if (check_image(im1, func1)) { return -1; } auto func2 = [&ref_im2](int x, int y, int z) { return ref_im2(x, y, z); }; if (check_image(im2, func2)) { return -1; } } } return 0; }
int intermediate_computed_if_param_test(int index) { buffer_index = index; Func f("f_" + std::to_string(index)), g("g_" + std::to_string(index)); Var x("x"), y("y"); Param<int> p; g(x, y) = x + y; f(x, y) = x + y; RDom r(0, 100, 0, 100); r.where(p > 3); f(r.x, r.y) += 2*g(r.x, r.y); // Expect g to be only computed over x=[0,99] and y=[0,99] if param is bigger // than 3. g.compute_root(); f.set_custom_trace(&box_bound_trace); g.trace_stores(); g.trace_realizations(); { printf("....Set p to 5, expect g to be computed\n"); p.set(5); run_tracer = false; niters_expected = 100*100; niters = 0; Image<int> im = f.realize(200, 200); for (int y = 0; y < im.height(); y++) { for (int x = 0; x < im.width(); x++) { int correct = x + y; if ((0 <= x && x <= 99) && (0 <= y && y <= 99)) { correct = 3*correct; } if (im(x, y) != correct) { printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct); return -1; } } } if (niters_expected != niters) { printf("intermediate_computed_if_param_test : Expect niters on g to be %d but got %d instead\n", niters_expected, niters); return -1; } } { printf("....Set p to 0, expect g to be not computed\n"); p.set(0); run_tracer = false; niters_expected = 0; niters = 0; Image<int> im = f.realize(200, 200); for (int y = 0; y < im.height(); y++) { for (int x = 0; x < im.width(); x++) { int correct = x + y; if (im(x, y) != correct) { printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct); return -1; } } } if (niters_expected != niters) { printf("intermediate_computed_if_param_test : Expect niters on g to be %d but got %d instead\n", niters_expected, niters); return -1; } } return 0; }
int update_defined_after_wrap_test() { Func source("source"), g("g"); Var x("x"), y("y"); source(x, y) = x + y; ImageParam img(Int(32), 2, "img"); Buffer<int> buf = source.realize(200, 200); img.set(buf); g(x, y) = img(x, y); Func wrapper = img.in(g); // Update of 'g' is defined after img.in(g) is called. g's updates should // still call img's wrapper. RDom r(0, 100, 0, 100); r.where(r.x < r.y); g(r.x, r.y) += 2*img(r.x, r.y); Param<bool> param; Var xi("xi"); RVar rxo("rxo"), rxi("rxi"); g.specialize(param).vectorize(x, 8).unroll(x, 2).split(x, x, xi, 4).parallel(x); g.update(0).split(r.x, rxo, rxi, 2).unroll(rxi); Func img_f = img; img_f.compute_root(); wrapper.compute_root().vectorize(_0, 8).unroll(_0, 2).split(_0, _0, xi, 4).parallel(_0); { param.set(true); // Check the call graphs. // Expect initialization of 'g' to call 'wrapper' and its update to call // 'wrapper' and 'g', wrapper' to call 'img_f', 'img_f' to call 'img' Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls c; m.functions().front().body.accept(&c); CallGraphs expected = { {g.name(), {wrapper.name(), g.name()}}, {wrapper.name(), {img_f.name()}}, {img_f.name(), {img.name()}}, }; if (check_call_graphs(c.calls, expected) != 0) { return -1; } Buffer<int> im = g.realize(200, 200); auto func = [](int x, int y) { return ((0 <= x && x <= 99) && (0 <= y && y <= 99) && (x < y)) ? 3*(x + y) : (x + y); }; if (check_image(im, func)) { return -1; } } { param.set(false); // Check the call graphs. 
// Expect initialization of 'g' to call 'wrapper' and its update to call // 'wrapper' and 'g', wrapper' to call 'img_f', 'img_f' to call 'img' Module m = g.compile_to_module({g.infer_arguments()}); CheckCalls c; m.functions().front().body.accept(&c); CallGraphs expected = { {g.name(), {wrapper.name(), g.name()}}, {wrapper.name(), {img_f.name()}}, {img_f.name(), {img.name()}}, }; if (check_call_graphs(c.calls, expected) != 0) { return -1; } Buffer<int> im = g.realize(200, 200); auto func = [](int x, int y) { return ((0 <= x && x <= 99) && (0 <= y && y <= 99) && (x < y)) ? 3*(x + y) : (x + y); }; if (check_image(im, func)) { return -1; } } return 0; }
int main(int argc, char **argv) { { call_count = 0; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f; f() = count_calls(0, 0); f.compute_root().memoize(); Image<uint8_t> result1 = f.realize(); Image<uint8_t> result2 = f.realize(); assert(result1(0) == 42); assert(result2(0) == 42); assert(call_count == 1); } { call_count = 0; Param<int32_t> coord; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f, g; Var x, y; f() = count_calls(coord, coord); f.compute_root().memoize(); g(x, y) = f(); coord.set(0); Image<uint8_t> out1 = g.realize(256, 256); Image<uint8_t> out2 = g.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == 42); assert(out2(i, j) == 42); } } assert(call_count == 1); coord.set(1); Image<uint8_t> out3 = g.realize(256, 256); Image<uint8_t> out4 = g.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out3(i, j) == 42); assert(out4(i, j) == 42); } } assert(call_count == 2); } { call_count = 0; Func count_calls; count_calls.define_extern("count_calls", std::vector<ExternFuncArgument>(), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (42 + 42)); assert(out2(i, j) == (42 + 42)); } } assert(call_count == 1); } call_count = 0; { Func count_calls_23; count_calls_23.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(23))), UInt(8), 2); Func count_calls_42; count_calls_42.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(42))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls_23(x, y) + count_calls_42(x, y); 
count_calls_23.compute_root().memoize(); count_calls_42.compute_root().memoize(); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 42)); assert(out2(i, j) == (23 + 42)); } } assert(call_count_with_arg == 2); } { Param<uint8_t> val1; Param<uint8_t> val2; call_count_with_arg = 0; Func count_calls_val1; count_calls_val1.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(Expr(val1))), UInt(8), 2); Func count_calls_val2; count_calls_val2.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(Expr(val2))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls_val1(x, y) + count_calls_val2(x, y); count_calls_val1.compute_root().memoize(); count_calls_val2.compute_root().memoize(); val1.set(23); val2.set(42); Image<uint8_t> out1 = f.realize(256, 256); Image<uint8_t> out2 = f.realize(256, 256); val1.set(42); Image<uint8_t> out3 = f.realize(256, 256); val1.set(23); Image<uint8_t> out4 = f.realize(256, 256); val1.set(42); Image<uint8_t> out5 = f.realize(256, 256); val2.set(57); Image<uint8_t> out6 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 42)); assert(out2(i, j) == (23 + 42)); assert(out3(i, j) == (42 + 42)); assert(out4(i, j) == (23 + 42)); assert(out5(i, j) == (42 + 42)); assert(out6(i, j) == (42 + 57)); } } assert(call_count_with_arg == 4); } { Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); val.set(23.0f); Image<uint8_t> out1 = f.realize(256, 256); val.set(23.4f); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) 
== (23 + 23)); assert(out2(i, j) == (23 + 23)); } } assert(call_count_with_arg == 2); } { Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(memoize_tag(cast<uint8_t>(val)))), UInt(8), 2); Func f; Var x, y; f(x, y) = count_calls(x, y) + count_calls(x, y); count_calls.compute_root().memoize(); val.set(23.0f); Image<uint8_t> out1 = f.realize(256, 256); val.set(23.4f); Image<uint8_t> out2 = f.realize(256, 256); for (int32_t i = 0; i < 256; i++) { for (int32_t j = 0; j < 256; j++) { assert(out1(i, j) == (23 + 23)); assert(out2(i, j) == (23 + 23)); } } assert(call_count_with_arg == 1); } { // Case with bounds computed not equal to bounds realized. Param<float> val; Param<int32_t> index; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f, g, h; Var x; f(x) = count_calls(x, 0) + cast<uint8_t>(x); g(x) = f(x); h(x) = g(4) + g(index); f.compute_root().memoize(); g.vectorize(x, 8).compute_at(h, x); val.set(23.0f); index.set(2); Image<uint8_t> out1 = h.realize(1); assert(out1(0) == (uint8_t)(2 * 23 + 4 + 2)); assert(call_count_with_arg == 3); index.set(4); out1 = h.realize(1); assert(out1(0) == (uint8_t)(2 * 23 + 4 + 4)); assert(call_count_with_arg == 4); } { // Test Tuple case Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = Tuple(count_calls(x, y) + cast<uint8_t>(x), x); count_calls.compute_root().memoize(); f.compute_root().memoize(); Func g; g(x, y) = Tuple(f(x, y)[0] + f(x - 1, y)[0] + f(x + 1, y)[0], f(x, y)[1]); val.set(23.0f); Realization out = g.realize(128, 128); Image<uint8_t> out0 = out[0]; Image<int32_t> out1 = out[1]; for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { 
assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); assert(out1(i, j) == i); } } out = g.realize(128, 128); out0 = out[0]; out1 = out[1]; for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); assert(out1(i, j) == i); } } assert(call_count_with_arg == 1); } { // Test cache eviction Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = count_calls(x, y) + cast<uint8_t>(x); count_calls.compute_root().memoize(); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } // TODO work out an assertion on call count here. fprintf(stderr, "Call count is %d.\n", call_count_with_arg); // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { // Test flushing entire cache with a single element larger than the cache Param<float> val; call_count_with_arg = 0; Func count_calls; count_calls.define_extern("count_calls_with_arg", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 2); Func f; Var x, y, xi, yi; f(x, y) = count_calls(x, y) + cast<uint8_t>(x); count_calls.compute_root().memoize(); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } // TODO work out an assertion on call count here. fprintf(stderr, "Call count before oversize realize is %d.\n", call_count_with_arg); call_count_with_arg = 0; Image<uint8_t> big = g.realize(1024, 1024); Image<uint8_t> big2 = g.realize(1024, 1024); // TODO work out an assertion on call count here. fprintf(stderr, "Call count after oversize realize is %d.\n", call_count_with_arg); call_count_with_arg = 0; for (int v = 0; v < 1000; v++) { int r = rand() % 256; val.set((float)r); Image<uint8_t> out1 = g.realize(128, 128); for (int32_t i = 0; i < 100; i++) { for (int32_t j = 0; j < 100; j++) { assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1))); } } } fprintf(stderr, "Call count is %d.\n", call_count_with_arg); // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { // Test parallel cache access Param<float> val; Func count_calls; count_calls.define_extern("count_calls_with_arg_parallel", Internal::vec(ExternFuncArgument(cast<uint8_t>(val))), UInt(8), 3); Func f; Var x, y; // Ensure that all calls map to the same cache key, but pass a thread ID // through to avoid having to do locking or an atomic add f(x, y) = count_calls(x, y % 4, memoize_tag(y / 16, 0)) + cast<uint8_t>(x); Func g; g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y); count_calls.compute_at(f, y).memoize(); f.compute_at(g, y).memoize(); g.parallel(y, 16); val.set(23.0f); Internal::JITSharedRuntime::memoization_cache_set_size(1000000); Image<uint8_t> out = g.realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1))); } } // TODO work out an assertion on call counts here. for (int i = 0; i < 8; i++) { fprintf(stderr, "Call count for thread %d is %d.\n", i, call_count_with_arg_parallel[i]); } // Return cache size to default. 
Internal::JITSharedRuntime::memoization_cache_set_size(0); } { Param<float> val; Func f; Var x, y; f(x, y) = cast<uint8_t>((x << 8) + y); Func prev_func = f; Func stage[4]; for (int i = 0; i < 4; i++) { std::vector<ExternFuncArgument> args(3); args[0] = cast<int32_t>(i); args[1] = cast<int32_t>(val); args[2] = prev_func; stage[i].define_extern("count_calls_staged", args, UInt(8), 2); prev_func = stage[i]; } f.compute_root(); for (int i = 0; i < 3; i++) { stage[i].compute_root(); } stage[3].compute_root().memoize(); val.set(23.0f); Image<uint8_t> result = stage[3].realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23)); } } for (int i = 0; i < 4; i++) { fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]); } result = stage[3].realize(128, 128); for (int32_t i = 0; i < 128; i++) { for (int32_t j = 0; j < 128; j++) { assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23)); } } for (int i = 0; i < 4; i++) { fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]); } } fprintf(stderr, "Success!\n"); return 0; }
int main(int argc, char **argv) { // Test bool check_range<bool, uint8_t>(0, 2, 0, 1, 0, 2, 0, 1, 0, 256, 0, 1, "<bool, uint8_t> exhaustive"); // Exhaustively test 8-bit cases check_range<uint8_t, uint8_t>(0, 256, 0, 1, 0, 256, 0, 1, 0, 256, 0, 1, "<uint8_t, uint8_t> exhaustive"); check_range<int8_t, uint8_t>(0, 256, -128, 1, 0, 256, -128, 1, 0, 256, 0, 1, "<int8_t, uint8_t> exhaustive"); check_range<uint8_t, float>(0, 256, 0, 1, 0, 256, 0, 1, 0, 256, 0, 1/255.0f, "<uint8_t, float> exhaustive"); check_range<int8_t, float>(0, 256, -128, 1, 0, 256, -128, 1, 0, 256, 0, 1/255.0f, "<int8_t, float> exhaustive"); // Check all delta values for 16-bit, verify swapping arguments doesn't break check_range<uint16_t, uint16_t>(0, 65536, 0, 1, 65535, 1, 0, 1, 0, 257, 255, 1, "<uint16_t, uint16_t> all zero starts"); check_range<uint16_t, uint16_t>(65535, 1, 0, 1, 0, 65536, 0, 1, 0, 257, 255, 1, "<uint16_t, uint16_t> all one starts"); // Verify different bit sizes for value and weight types check_range<uint16_t, uint8_t>(0, 1, 0, 1, 65535, 1, 0, 1, 0, 255, 1, 1, "<uint16_t, uint8_t> zero, one uint8_t weight test"); check_range<uint16_t, uint32_t>(0, 1, 0, 1, 65535, 1, 0, 1, std::numeric_limits<int32_t>::min(), 257, 255 * 65535, 1, "<uint16_t, uint8_t> zero, one uint32_t weight test"); check_range<uint32_t, uint8_t>(0, 1, 0, 1, 1 << 31, 1, 0, 1, 0, 255, 0, 1, "<uint32_t, uint8_t> weight test"); check_range<uint32_t, uint16_t>(0, 1, 0, 1, 1 << 31, 1, 0, 1, 0, 65535, 0, 1, "<uint32_t, uint16_t> weight test"); // Verify float weights with integer values check_range<uint16_t, float>(0, 1, 0, 1, 65535, 1, 0, 1, 0, 257, 0, 255.0f/65535.0f, "<uint16_t, float> zero, one float weight test"); check_range<int16_t, uint16_t>(0, 65536, -32768, 1, 0, 1, 0, 1, 0, 257, 0, 255, "<int16_t, uint16_t> all zero starts"); #if 0 // takes too long, difficult to test with uint32_t // Check all delta values for 32-bit, do it in signed arithmetic check_range<int32_t, 
uint32_t>(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max(), 0, 1, 1 << 31, 1, 0, 1, 0, 1, 1 << 31, 1, "<uint32_t, uint32_t> all zero starts"); #endif check_range<float, float>(0, 100, 0, .01, 0, 100, 0, .01, 0, 100, 0, .01, "<float, float> float values 0 to 1 by 1/100ths"); check_range<float, float>(0, 100, -5, .1, 0, 100, 0, .1, 0, 100, 0, .1, "<float, float> float values -5 to 5 by 1/100ths"); // Verify float values with integer weights check_range<float, uint8_t>(0, 100, -5, .1, 0, 100, 0, .1, 0, 255, 0, 1, "<float, uint8_t> float values -5 to 5 by 1/100ths"); check_range<float, uint16_t>(0, 100, -5, .1, 0, 100, 0, .1, 0, 255, 0, 257, "<float, uint16_t> float values -5 to 5 by 1/100ths"); check_range<float, uint32_t>(0, 100, -5, .1, 0, 100, 0, .1, std::numeric_limits<int32_t>::min(), 257, 255 * 65535, 1, "<float, uint32_t> float values -5 to 5 by 1/100ths"); // Check constant and constant case: Func lerp_constants("lerp_constants"); lerp_constants() = lerp(0, cast<uint32_t>(1023), .5f); Image<uint32_t> result = lerp_constants.realize(); uint32_t expected = evaluate<uint32_t>(cast<uint32_t>(lerp(0, cast<uint16_t>(1023), .5f))); if (result(0) != expected) std::cerr << "Expected " << expected << " got " << result(0) << std::endl; assert(result(0) == expected); // Add a little more coverage for uint32_t as this was failing // without being detected for a long time. 
Image<uint8_t> input_a_img(16, 16); Image<uint8_t> input_b_img(16, 16); for (int i = 0; i < 16; i ++) { for (int j = 0; j < 16; j ++) { input_a_img(i, j) = (i << 4) + j; input_b_img(i, j) = ((15 - i) << 4) + (15 - j); } } ImageParam input_a(UInt(8), 2); ImageParam input_b(UInt(8), 2); Var x, y; Func lerp_with_casts; Param<float> w; lerp_with_casts(x, y) = lerp(cast<int32_t>(input_a(x, y)), cast<int32_t>(input_b(x, y)), w); lerp_with_casts.vectorize(x, 4); input_a.set(input_a_img); input_b.set(input_b_img); w.set(0.0f); Image<int32_t> result_should_be_a = lerp_with_casts.realize(16, 16); w.set(1.0f); Image<int32_t> result_should_be_b = lerp_with_casts.realize(16, 16); for (int i = 0; i < 16; i ++) { for (int j = 0; j < 16; j ++) { assert(input_a_img(i, j) == result_should_be_a(i, j)); assert(input_b_img(i, j) == result_should_be_b(i, j)); } } std::cout << "Success!" << std::endl; }
int gpu_intermediate_computed_if_param_test(int index) { buffer_index = index; Func f("f_" + std::to_string(index)), g("g_" + std::to_string(index)), h("h_" + std::to_string(index)); Var x("x"), y("y"); Param<int> p; g(x, y) = x + y; h(x, y) = 10; f(x, y) = x + y; RDom r1(0, 100, 0, 100); r1.where(p > 3); f(r1.x, r1.y) += 2*g(r1.x, r1.y); RDom r2(0, 100, 0, 100); r2.where(p <= 3); f(r2.x, r2.y) += h(r2.x, r2.y) + g(r2.x, r2.y); f.update(0).specialize(p >= 2).gpu_tile(r1.x, r1.y, 4, 4); g.compute_root(); h.compute_root(); h.gpu_tile(x, y, 8, 8); { printf("....Set p to 5, expect g to be computed\n"); p.set(5); run_tracer = false; niters_expected = 100*100; niters = 0; Image<int> im = f.realize(200, 200); for (int y = 0; y < im.height(); y++) { for (int x = 0; x < im.width(); x++) { int correct = x + y; if ((0 <= x && x <= 99) && (0 <= y && y <= 99)) { correct = 3*correct; } if (im(x, y) != correct) { printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct); return -1; } } } } { printf("....Set p to 0, expect g to be not computed\n"); p.set(0); run_tracer = false; niters_expected = 0; niters = 0; Image<int> im = f.realize(200, 200); for (int y = 0; y < im.height(); y++) { for (int x = 0; x < im.width(); x++) { int correct = x + y; if ((0 <= x && x <= 99) && (0 <= y && y <= 99)) { correct += 10 + correct; } if (im(x, y) != correct) { printf("im(%d, %d) = %d instead of %d\n", x, y, im(x, y), correct); return -1; } } } } return 0; }
int main(int argc, char **argv) { { Param<bool> param; Func f; Var x; f(x) = select(param, x*3, x*17); // Vectorize when the output is large enough Expr cond = (f.output_buffer().width() >= 4); f.specialize(cond).vectorize(x, 4); // This has created a specialization of f that is // vectorized. Now we want to further specialize both the // default case and the special case based on param. We can // retrieve a reference to the specialization using the same // condition again: f.specialize(cond).specialize(param); // Now specialize the narrow case on param as well f.specialize(param); f.set_custom_trace(&my_trace); f.trace_stores(); Image<int> out(100); // Just check that all the specialization didn't change the output. param.set(true); reset_trace(); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*3; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } param.set(false); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*17; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); return -1; } // Now try a smaller input out = Image<int>(3); param.set(true); reset_trace(); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*3; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } param.set(false); f.realize(out); for (int i = 0; i < out.width(); i++) { int correct = i*17; if (out(i) != correct) { printf("out(%d) was %d instead of %d\n", i, out(i), correct); } } // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); return -1; } } { Func f1, f2, g1, g2; Var x; // Define pipeline A f1(x) = x + 7; g1(x) = f1(x) + f1(x + 1); // Define pipeline B f2(x) = x * 34; g2(x) = f2(x) + f2(x - 1); // Switch between them 
based on a boolean param Param<bool> param; Func out; out(x) = select(param, g1(x), g2(x)); // These will be outside the condition that specializes out, // but skip stages will nuke their allocation and computation // for us. f1.compute_root(); g1.compute_root(); f2.compute_root(); out.specialize(param); // Count allocations. out.set_custom_allocator(&my_malloc, &my_free); reset_alloc_counts(); param.set(true); out.realize(100); if (empty_allocs != 1 || nonempty_allocs != 2 || frees != 3) { printf("There were supposed to be 1 empty alloc, 2 nonempty allocs, and 3 frees.\n" "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n", empty_allocs, nonempty_allocs, frees); return -1; } reset_alloc_counts(); param.set(false); out.realize(100); if (empty_allocs != 2 || nonempty_allocs != 1 || frees != 3) { printf("There were supposed to be 2 empty allocs, 1 nonempty alloc, and 3 frees.\n" "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n", empty_allocs, nonempty_allocs, frees); return -1; } } { // Specialize for interleaved vs planar inputs ImageParam im(Float(32), 1); im.set_stride(0, Expr()); // unconstrain the stride Func f; Var x; f(x) = im(x); // If we have a stride of 1 it's worth vectorizing, but only if the width is also > 8. 
f.specialize(im.stride(0) == 1 && im.width() >= 8).vectorize(x, 8); f.trace_stores(); f.set_custom_trace(&my_trace); // Check bounds inference is still cool with widths < 8 f.infer_input_bounds(5); int m = im.get().min(0), e = im.get().extent(0); if (m != 0 || e != 5) { printf("min, extent = %d, %d instead of 0, 5\n", m, e); return -1; } // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize(5); if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return -1; } // Check we don't crash with a larger input, and that it uses vector stores Image<float> image(100); im.set(image); reset_trace(); f.realize(100); if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return -1; } } { // Bounds required of the input change depending on the param ImageParam im(Float(32), 1); Param<bool> param; Func f; Var x; f(x) = select(param, im(x + 10), im(x - 10)); f.specialize(param); param.set(true); f.infer_input_bounds(100); int m = im.get().min(0); if (m != 10) { printf("min %d instead of 10\n", m); return -1; } param.set(false); im.set(Buffer()); f.infer_input_bounds(100); m = im.get().min(0); if (m != -10) { printf("min %d instead of -10\n", m); return -1; } } { // Specialize an update definition Func f; Var x; Param<int> start, size; RDom r(start, size); f(x) = x; f(r) = 10 - r; // Special-case for when we only update one element of f f.update().specialize(size == 1); // Also special-case updating no elements of f f.update().specialize(size == 0); start.set(0); size.set(1); // Not crashing is enough f.realize(100); } { // What happens to bounds inference if an input is not used at // all for a given specialization? 
ImageParam im(Float(32), 1); Param<bool> param; Func f; Var x; f(x) = select(param, im(x), 0.0f); f.specialize(param); param.set(false); Image<float> image(10); im.set(image); // The image is too small, but that should be OK, because the // param is false so the image will never be used. f.realize(100); } { // Specialization inherits the scheduling directives done so far: ImageParam im(Int(32), 2); Func f; Var x, y; f(x, y) = im(x, y); Expr cond = f.output_buffer().width() >= 4; // Unroll y by two innermost. f.reorder(y, x).unroll(y, 2).reorder(x, y); // Vectorize if the output is at least 4-wide. Inherits the // unrolling already done. f.specialize(cond).vectorize(x, 4); // Confirm that the unrolling applies to both cases using bounds inference: f.infer_input_bounds(3, 1); if (im.get().extent(0) != 3) { printf("extent(0) was supposed to be 3.\n"); return -1; } if (im.get().extent(1) != 2) { // Height is 2, because the unrolling also happens in the // specialized case. printf("extent(1) was supposed to be 2.\n"); return -1; } } { // Check we don't need to specialize intermediate stages. ImageParam im(Int(32), 1); Func f, g, h, out; Var x; f(x) = im(x); g(x) = f(x); h(x) = g(x); out(x) = h(x); Expr w = out.output_buffer().extent(0); out.output_buffer().set_min(0, 0); f.compute_root().specialize(w >= 4).vectorize(x, 4); g.compute_root().vectorize(x, 4); h.compute_root().vectorize(x, 4); out.specialize(w >= 4).vectorize(x, 4); Image<int> input(3), output(3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); } { // Check specializations of stages nested in other stages simplify appropriately. 
ImageParam im(Int(32), 2); Param<bool> cond1, cond2; Func f, out; Var x, y; f(x, y) = im(x, y); out(x, y) = f(x, y); f.compute_at(out, x).specialize(cond1 && cond2).vectorize(x, 4); out.compute_root().specialize(cond1 && cond2).vectorize(x, 4); if_then_else_count = 0; CountIfThenElse pass1; for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) { pass1.mutate(ff.body); } Image<int> input(3, 3), output(3, 3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); if (if_then_else_count != 1) { printf("Expected 1 IfThenElse stmts. Found %d.\n", if_then_else_count); return -1; } } { // Check specializations of stages nested in other stages simplify appropriately. ImageParam im(Int(32), 2); Param<bool> cond1, cond2; Func f, out; Var x, y; f(x, y) = im(x, y); out(x, y) = f(x, y); f.compute_at(out, x).specialize(cond1).vectorize(x, 4); out.compute_root().specialize(cond1 && cond2).vectorize(x, 4); if_then_else_count = 0; CountIfThenElse pass2; for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) { pass2.mutate(ff.body); } Image<int> input(3, 3), output(3, 3); // Shouldn't throw a bounds error: im.set(input); out.realize(output); // There should have been 2 Ifs total: They are the // outer cond1 && cond2, and the condition in the true case // should have been simplified away. The If in the false // branch cannot be simplified. if (if_then_else_count != 2) { printf("Expected 2 IfThenElse stmts. Found %d.\n", if_then_else_count); return -1; } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { if (get_jit_target_from_environment().has_feature(Target::Profile)) { // The profiler adds lots of extra prints, so counting the // number of prints is not useful. printf("Skipping test because profiler is active\n"); return 0; } Var x; { Func f; f(x) = print(x * x, "the answer is", 42.0f, "unsigned", cast<uint32_t>(145)); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 10); for (size_t i = 0; i < messages.size(); i++) { long square; float forty_two; unsigned long one_forty_five; int scan_count = sscanf(messages[i].c_str(), "%ld the answer is %f unsigned %lu", &square, &forty_two, &one_forty_five); assert(scan_count == 3); assert(square == static_cast<long long>(i * i)); assert(forty_two == 42.0f); assert(one_forty_five == 145); } } messages.clear(); { Func f; Param<int> param; param.set(127); // Test a string containing a printf format specifier (It should print it as-is). f(x) = print_when(x == 3, x * x, "g", 42.0f, "%s", param); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 1); long nine; float forty_two; long p; int scan_count = sscanf(messages[0].c_str(), "%ld g %f %%s %ld", &nine, &forty_two, &p); assert(scan_count == 3); assert(nine == 9); assert(forty_two == 42.0f); assert(p == 127); } messages.clear(); { Func f; // Test a single message longer than 8K. 
std::vector<Expr> args; for (int i = 0; i < 500; i++) { uint64_t n = i; n *= n; n *= n; n *= n; n *= n; n += 100; int32_t hi = n >> 32; int32_t lo = n & 0xffffffff; args.push_back((cast<uint64_t>(hi) << 32) | lo); Expr dn = cast<double>((float)(n)); args.push_back(dn); } f(x) = print(args); f.set_custom_print(halide_print); Buffer<uint64_t> result = f.realize(1); if (result(0) != 100) { return -1; } assert(messages.back().size() == 8191); } messages.clear(); // Check that Halide's stringification of floats and doubles // matches %f and %e respectively. #ifndef _WIN32 // msvc's library has different ideas about how %f and %e should come out. { Func f, g; const int N = 1000000; Expr e = reinterpret(Float(32), random_uint()); // Make sure we cover some special values. e = select(x == 0, 0.0f, x == 1, -0.0f, x == 2, std::numeric_limits<float>::infinity(), x == 3, -std::numeric_limits<float>::infinity(), x == 4, std::numeric_limits<float>::quiet_NaN(), x == 5, -std::numeric_limits<float>::quiet_NaN(), e); e = select(x == 5, std::numeric_limits<float>::denorm_min(), x == 6, -std::numeric_limits<float>::denorm_min(), x == 7, std::numeric_limits<float>::min(), x == 8, -std::numeric_limits<float>::min(), x == 9, std::numeric_limits<float>::max(), x == 10, -std::numeric_limits<float>::max(), x == 11, 1.0f - 1.0f / (1 << 22), e); f(x) = print(e); f.set_custom_print(halide_print); Buffer<float> imf = f.realize(N); assert(messages.size() == (size_t)N); char correct[1024]; for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%f\n", imf(i)); // OS X prints -nan as nan #ifdef __APPLE__ if (messages[i] == "-nan\n") messages[i] = "nan\n"; #endif if (messages[i] != correct) { printf("float %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, imf(i)); return -1; } } messages.clear(); g(x) = print(reinterpret(Float(64), (cast<uint64_t>(random_uint()) << 32) | random_uint())); g.set_custom_print(halide_print); Buffer<double> img = g.realize(N); 
assert(messages.size() == (size_t)N); for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%e\n", img(i)); #ifdef __APPLE__ if (messages[i] == "-nan\n") messages[i] = "nan\n"; #endif if (messages[i] != correct) { printf("double %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, img(i)); return -1; } } } #endif printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Func f1, f2, f3, f4, f5; Func g1, g2, g3, g4, g5; Var x, xi; Expr v = x*1.34f + 1.0142f; Param<float> p; p.set(1.0f); // Test accuracy of reciprocals. // First prevent any optimizations by hiding 1.0 in a param. f1(x) = p / v; // Now test various vectorization widths with an explicit 1.0. On // arm 2 and 4 trigger optimizations. On x86 4 and 8 do. f2(x) = fast_inverse(v); f2.vectorize(x, 2); f3(x) = fast_inverse(v); f3.vectorize(x, 4); f4(x) = fast_inverse(v); f4.vectorize(x, 8); // Same thing for reciprocal square root. g1(x) = p / sqrt(v); g2(x) = fast_inverse_sqrt(v); g2.vectorize(x, 2); g3(x) = fast_inverse_sqrt(v); g3.vectorize(x, 4); g4(x) = fast_inverse_sqrt(v); g4.vectorize(x, 8); // Also test both on the GPU. f5(x) = fast_inverse(v); g5(x) = fast_inverse_sqrt(v); Target t = get_jit_target_from_environment(); if (t.has_gpu_feature()) { f5.gpu_tile(x, xi, 16); g5.gpu_tile(x, xi, 16); } Buffer<float> imf1 = f1.realize(10000); Buffer<float> imf2 = f2.realize(10000); Buffer<float> imf3 = f3.realize(10000); Buffer<float> imf4 = f4.realize(10000); Buffer<float> imf5 = f5.realize(10000); Buffer<float> img1 = g1.realize(10000); Buffer<float> img2 = g2.realize(10000); Buffer<float> img3 = g3.realize(10000); Buffer<float> img4 = g4.realize(10000); Buffer<float> img5 = g5.realize(10000); printf("Testing accuracy of inverse\n"); check(imf1, imf2); check(imf1, imf3); check(imf1, imf4); check(imf1, imf5); printf("Pass.\n"); printf("Testing accuracy of inverse sqrt\n"); check(img1, img2); check(img1, img3); check(img1, img4); check(img1, img5); printf("Pass.\n"); printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Func f1, f2, f3, f4; Func g1, g2, g3, g4; Var x; Expr v = x*1.34f + 1.0142f; Param<float> p; p.set(1.0f); // Test accuracy of reciprocals. // First prevent any optimizations by hiding 1.0 in a param. f1(x) = p / v; // Now test various vectorization widths with an explicit 1.0. On // arm 2 and 4 trigger optimizations. On x86 4 and 8 do. f2(x) = 1.0f / v; f2.vectorize(x, 2); f3(x) = 1.0f / v; f3.vectorize(x, 4); f4(x) = 1.0f / v; f4.vectorize(x, 8); // Same thing for reciprocal square root. g1(x) = p / sqrt(v); g2(x) = 1.0f / sqrt(v); g2.vectorize(x, 2); g3(x) = 1.0f / sqrt(v); g3.vectorize(x, 4); g4(x) = 1.0f / sqrt(v); g4.vectorize(x, 8); Image<float> imf1 = f1.realize(10000); Image<float> imf2 = f2.realize(10000); Image<float> imf3 = f3.realize(10000); Image<float> imf4 = f4.realize(10000); Image<float> img1 = g1.realize(10000); Image<float> img2 = g2.realize(10000); Image<float> img3 = g3.realize(10000); Image<float> img4 = g4.realize(10000); printf("Testing accuracy of inverse\n"); check(imf1, imf2); check(imf1, imf3); check(imf1, imf4); printf("Pass.\n"); printf("Testing accuracy of inverse sqrt\n"); check(img1, img2); check(img1, img3); check(img1, img4); printf("Pass.\n"); printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Target target = get_jit_target_from_environment(); if (target.has_feature(Target::Profile)) { // The profiler adds lots of extra prints, so counting the // number of prints is not useful. printf("Skipping test because profiler is active\n"); return 0; } if (target.has_feature(Target::Debug)) { // Same thing here: the runtime debug adds lots of extra prints, // so counting the number of prints is not useful. printf("Skipping test because runtime debug is active\n"); return 0; } Var x; { Func f; f(x) = print(x * x, "the answer is", 42.0f, "unsigned", cast<uint32_t>(145)); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 10); for (size_t i = 0; i < messages.size(); i++) { long square; float forty_two; unsigned long one_forty_five; int scan_count = sscanf(messages[i].c_str(), "%ld the answer is %f unsigned %lu", &square, &forty_two, &one_forty_five); assert(scan_count == 3); assert(square == static_cast<long long>(i * i)); assert(forty_two == 42.0f); assert(one_forty_five == 145); } } messages.clear(); { Func f; Param<int> param; param.set(127); // Test a string containing a printf format specifier (It should print it as-is). f(x) = print_when(x == 3, x * x, "g", 42.0f, "%s", param); f.set_custom_print(halide_print); Buffer<int32_t> result = f.realize(10); for (int32_t i = 0; i < 10; i++) { if (result(i) != i * i) { return -1; } } assert(messages.size() == 1); long nine; float forty_two; long p; int scan_count = sscanf(messages[0].c_str(), "%ld g %f %%s %ld", &nine, &forty_two, &p); assert(scan_count == 3); assert(nine == 9); assert(forty_two == 42.0f); assert(p == 127); } messages.clear(); { Func f; // Test a single message longer than 8K. 
std::vector<Expr> args; for (int i = 0; i < 500; i++) { uint64_t n = i; n *= n; n *= n; n *= n; n *= n; n += 100; int32_t hi = n >> 32; int32_t lo = n & 0xffffffff; args.push_back((cast<uint64_t>(hi) << 32) | lo); Expr dn = cast<double>((float)(n)); args.push_back(dn); } f(x) = print(args); f.set_custom_print(halide_print); Buffer<uint64_t> result = f.realize(1); if (result(0) != 100) { return -1; } assert(messages.back().size() == 8191); } messages.clear(); // Check that Halide's stringification of floats and doubles // matches %f and %e respectively. #ifndef _WIN32 // msvc's library has different ideas about how %f and %e should come out. { Func f, g; const int N = 1000000; Expr e = reinterpret(Float(32), random_uint()); // Make sure we cover some special values. e = select(x == 0, 0.0f, x == 1, -0.0f, x == 2, std::numeric_limits<float>::infinity(), x == 3, -std::numeric_limits<float>::infinity(), x == 4, std::numeric_limits<float>::quiet_NaN(), x == 5, -std::numeric_limits<float>::quiet_NaN(), e); e = select(x == 5, std::numeric_limits<float>::denorm_min(), x == 6, -std::numeric_limits<float>::denorm_min(), x == 7, std::numeric_limits<float>::min(), x == 8, -std::numeric_limits<float>::min(), x == 9, std::numeric_limits<float>::max(), x == 10, -std::numeric_limits<float>::max(), x == 11, 1.0f - 1.0f / (1 << 22), e); f(x) = print(e); f.set_custom_print(halide_print); Buffer<float> imf = f.realize(N); assert(messages.size() == (size_t)N); char correct[1024]; for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%f\n", imf(i)); // Some versions of the std library can emit some NaN patterns // as "-nan", due to sloppy conversion (or not) of the sign bit. // Halide considers all NaN's equivalent, so paper over this // noise in the test by normalizing all -nan -> nan. 
if (messages[i] == "-nan\n") messages[i] = "nan\n"; if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n"); if (messages[i] != correct) { printf("float %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, imf(i)); return -1; } } messages.clear(); g(x) = print(reinterpret(Float(64), (cast<uint64_t>(random_uint()) << 32) | random_uint())); g.set_custom_print(halide_print); Buffer<double> img = g.realize(N); assert(messages.size() == (size_t)N); for (int i = 0; i < N; i++) { snprintf(correct, sizeof(correct), "%e\n", img(i)); // Some versions of the std library can emit some NaN patterns // as "-nan", due to sloppy conversion (or not) of the sign bit. // Halide considers all NaN's equivalent, so paper over this // noise in the test by normalizing all -nan -> nan. if (messages[i] == "-nan\n") messages[i] = "nan\n"; if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n"); if (messages[i] != correct) { printf("double %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, img(i)); return -1; } } } #endif messages.clear(); { Func f; // Test a vectorized print. f(x) = print(x * 3); f.set_custom_print(halide_print); f.vectorize(x, 32); if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(); } Buffer<int> result = f.realize(128); if (!target.features_any_of({Target::HVX_64, Target::HVX_128})) { assert((int)messages.size() == result.width()); for (size_t i = 0; i < messages.size(); i++) { assert(messages[i] == std::to_string(i * 3) + "\n"); } } else { // The Hexagon simulator prints directly to stderr, so we // can't read the messages. } } messages.clear(); { Func f; // Test a vectorized print_when. 
f(x) = print_when(x % 2 == 0, x * 3); f.set_custom_print(halide_print); f.vectorize(x, 32); if (target.features_any_of({Target::HVX_64, Target::HVX_128})) { f.hexagon(); } Buffer<int> result = f.realize(128); if (!target.features_any_of({Target::HVX_64, Target::HVX_128})) { assert((int)messages.size() == result.width() / 2); for (size_t i = 0; i < messages.size(); i++) { assert(messages[i] == std::to_string(i * 2 * 3) + "\n"); } } else { // The Hexagon simulator prints directly to stderr, so we // can't read the messages. } } printf("Success!\n"); return 0; }
int main(int argc, char **argv) { Func f; Var x; f(x) = x; f.compute_root(); // Halide will partition a loop into three pieces in a few // situations. The pieces are 1) a messy prologue, 2) a clean // steady state, and 3) a messy epilogue. One way to trigger this // is if you use a boundary condition helper: { Func g = BoundaryConditions::repeat_edge(f, 0, 100); count_partitions(g, 3); } // If you vectorize or otherwise split, then the last vector // (which gets shifted leftwards) is its own partition. This // removes some clamping logic from the inner loop. { Func g; g(x) = f(x); g.vectorize(x, 8); count_partitions(g, 2); } // The slicing applies to every loop level starting from the // outermost one, but only recursively simplifies the clean steady // state. It either splits things three (start, middle, end). So // adding a boundary condition to a 2D computation will produce 5 // code paths for the top, bottom, left, right, and center of the // image. { Var y; Func g; g(x, y) = x + y; g.compute_root(); Func h = BoundaryConditions::mirror_image(g, 0, 10, 0, 10); count_partitions(h, 5); } // If you split and also have a boundary condition, or have // multiple boundary conditions at play (e.g. because you're // blurring an inlined Func that uses a boundary condition), then // there are still only three partitions. The steady state is the // slice of the loop where *all* of the boundary conditions and // splitting logic simplify away. { Func g = BoundaryConditions::mirror_interior(f, 0, 10); Func h; Param<int> t1, t2; h(x) = g(x-1) + g(x+1); h.vectorize(x, 8); count_partitions(h, 3); } // You can manually control the splitting behavior using the // 'likely' intrinsic. When used on one side of a select, min, // max, or clamp, it tags the select, min, max, or clamp as likely // to simplify to that expression in the steady state case, and // tries to solve for loop variable values for which this is true. 
{ // So this code should produce a prologue that evaluates to sin(x), and // a steady state that evaluates to 1: Func g; g(x) = select(x < 10, sin(x), likely(1.0f)); // There should be two partitions count_partitions(g, 2); // But only one should call sin count_sin_calls(g, 1); } { // This code should produce a prologue and epilogue that // evaluate sin(x), and a steady state that evaluates to 1: Func g; g(x) = select(x < 10 || x > 100, sin(x), likely(1.0f)); // There should be three partitions count_partitions(g, 3); // With calls to sin in the prologue and epilogue. count_sin_calls(g, 2); } // As a specialize case, we treat clamped ramps as likely to // simplify to the clamped expression. This handles the many // existing cases where people have written their boundary // condition manually using clamp. { Func g; g(x) = f(clamp(x, 0, 10)); // treated as clamp(likely(x), 0, 10) g.vectorize(x, 8); count_partitions(g, 3); } // Using the likely intrinsic pulls some IR relating to the // condition outside of the loop. We'd better check that this // respects lets and doesn't do any combinatorial expansion. We'll // do this with a nasty comparison: { Func g; Var y; // Have an inner reduction loop that the comparisons depend on // to make things harder. RDom r(0, 5); const int N = 25; // Make some nasty expressions to compare to. Expr e[N]; e[0] = y; for (int i = 1; i < N; i++) { e[i] = e[i-1] * e[i-1] + y + r; } // Make a nasty condition that uses all of these. Expr nasty = cast<bool>(1); for (int i = 0; i < N; i++) { nasty = nasty && (x*(i+1) < e[i]); } // Have an innermost loop over c to complicate things further. Var c; g(c, x, y) = sum(select(nasty, likely(10), c + r)); // Check that it doesn't take the age of the world to compile, // and that it produces the right number of partitions. count_partitions(g, 3); } // Make sure partitions that occur outside of the actual bounds // don't mess things up. 
{ Func g; Var x; Param<int> limit; g(x) = select(x > limit, likely(3), 2); // If either of these realize calls iterates from 0 to limit, // and then from limit to 10, we'll have a nice segfault. limit.set(10000000); Buffer<int> result = g.realize(10); limit.set(-10000000); result = g.realize(10); } // The performance of this behavior is tested in // test/performance/boundary_conditions.cpp printf("Success!\n"); return 0; }