コード例 #1
0
ファイル: param.cpp プロジェクト: drtpig/Halide
// Realizes f(x) = u twice, with the scalar Param u bound to 17 and then to
// 123, and verifies that every element of each result equals the value that
// was bound at the time of the realize call.
// Returns 0 on success; on the first mismatch prints "Failed!", dumps both
// buffers side by side and returns -1.
int main(int argc, char **argv) {
    const int size = 1024;

    Var x("x");
    Func f("f");

    Param<float> u;

    f(x) = u;

    // Apply the CUDA tiling schedule only when the target read from the
    // environment has the CUDA bit set in its feature mask.
    Target target = get_target_from_environment();
    if (target.features & Target::CUDA) {
        f.cuda_tile(x, 256);
    }

    u.set(17.0f);
    Image<float> out_17 = f.realize(size);

    u.set(123.0f);
    Image<float> out_123 = f.realize(size);

    // Scan until the first element that disagrees with its bound value.
    bool mismatch = false;
    for (int idx = 0; idx < size && !mismatch; idx++) {
        mismatch = (out_17(idx) != 17.0f) || (out_123(idx) != 123.0f);
    }

    if (mismatch) {
        printf("Failed!\n");
        for (int k = 0; k < size; k++) {
            printf("%f %f\n", out_17(k), out_123(k));
        }
        return -1;
    }

    printf("Success!\n");
    return 0;
}
コード例 #2
0
ファイル: param.cpp プロジェクト: 202198/Halide
// Same check as a scalar-Param rebinding test: f(x) = u is realized with
// u = 17 and with u = 123, and each output must be filled with the value
// bound when it was realized. The target is obtained as a string here.
// Returns 0 on success, -1 after printing a dump on the first mismatch.
int main(int argc, char **argv) {
    const int size = 1024;

    Var x("x");
    Func f("f");

    Param<float> u;

    f(x) = u;

    // The CUDA tiling schedule is used for the two PTX target strings only.
    std::string target = get_target();
    const bool is_ptx = (target == "ptx" || target == "ptx-debug");
    if (is_ptx) {
        f.cuda_tile(x, 256);
    }

    u.set(17.0f);
    Image<float> out_17 = f.realize(size);

    u.set(123.0f);
    Image<float> out_123 = f.realize(size);

    // True when both buffers hold their expected value at the given index.
    auto matches = [&](int idx) {
        return out_17(idx) == 17.0f && out_123(idx) == 123.0f;
    };

    for (int idx = 0; idx < size; idx++) {
        if (matches(idx)) {
            continue;
        }
        printf("Failed!\n");
        for (int k = 0; k < size; k++) {
            printf("%f %f\n", out_17(k), out_123(k));
        }
        return -1;
    }

    printf("Success!\n");
    return 0;
}
コード例 #3
0
ファイル: rfactor.cpp プロジェクト: jiawen/Halide
// Exercises rfactor() applied to a specialized update stage of g.
//
// compile_module == true : compiles g to a Module (with p set to 20) and
//                          compares the recorded call graph with the expected one.
// compile_module == false: realizes g with p = 0 and p = 20 and compares each
//                          result with a closed-form reference.
// Returns 0 on success and -1 on any mismatch.
int simple_rfactor_with_specialize_test(bool compile_module) {
    Func f("f"), g("g");
    Var x("x"), y("y");

    f(x, y) = x + y;
    f.compute_root();

    g(x, y) = 40;
    RDom r(10, 20, 30, 40);
    g(r.x, r.y) = min(f(r.x, r.y) + 2, g(r.x, r.y));

    Param<int> p;
    Var u("u");
    Func intm = g.update(0).specialize(p >= 10).rfactor(r.y, u);
    intm.compute_root();
    intm.vectorize(u, 8);
    intm.update(0).vectorize(r.x, 2);

    if (!compile_module) {
        // Reference value: inside the reduction domain the update applies
        // min(x + y + 2, 40); everywhere else g keeps its initial value 40.
        auto reference = [](int x, int y, int z) {
            const bool inside = (10 <= x && x <= 29) && (30 <= y && y <= 69);
            return inside ? std::min(x + y + 2, 40) : 40;
        };

        // The output must be the same on either side of the p >= 10 specialization.
        const int p_values[] = {0, 20};
        for (int value : p_values) {
            p.set(value);
            Image<int> im = g.realize(80, 80);
            if (check_image(im, reference)) {
                return -1;
            }
        }
        return 0;
    }

    p.set(20);
    // Check the call graphs.
    Module m = g.compile_to_module({g.infer_arguments()});
    CheckCalls checker;
    m.functions().front().body.accept(&checker);

    CallGraphs expected = {
        {g.name(), {}},
        {g.update(0).name(), {f.name(), intm.name(), g.name()}},
        {intm.name(), {}},
        {intm.update(0).name(), {f.name(), intm.name()}},
        {f.name(), {}},
    };
    if (check_call_graphs(checker.calls, expected) != 0) {
        return -1;
    }
    return 0;
}
コード例 #4
0
// Memoization test for a tuple-valued Func: f2 selects between the two
// components of the memoized f1 and the constant 1, depending on a boolean
// Param. Both output components are validated with check_correctness_single.
//
// toggle_val: value bound to the Param (also stored in set_toggle1).
// index:      stored in buffer_index and used as a suffix for the Func names.
// Returns 0 when both components pass, -1 otherwise.
int tuple_memoize_test(bool toggle_val, int index) {
    buffer_index = index;

    const std::string suffix = std::to_string(index);

    Param<bool> toggle;
    Func f1("f1_" + suffix), f2("f2_" + suffix);
    Var x;

    f1(x) = Tuple(2*x, 2*x);
    f2(x) = Tuple(select(toggle, f1(x)[0], 1),
                  select(toggle, f1(x)[1], 1));

    f1.compute_root().memoize();

    f2.set_custom_trace(&single_toggle_trace);
    f1.trace_stores();

    f2.compile_jit();

    set_toggle1 = toggle_val;
    toggle.set(set_toggle1);

    Realization out = f2.realize(128);
    Image<int> out0 = out[0];
    Image<int> out1 = out[1];

    // The second component is only inspected when the first one is correct.
    const bool both_ok = check_correctness_single(out0, set_toggle1) == 0 &&
                         check_correctness_single(out1, set_toggle1) == 0;
    return both_ok ? 0 : -1;
}
コード例 #5
0
// Builds a pipeline whose reduction-domain extent is read from a buffer
// element selected by a scalar Param, and whose pure definition depends on
// the width of an ImageParam. After binding the parameters the output is
// compared element by element with the expected values.
// Returns 0 on success, -1 on the first wrong element.
int main(int argc, char **argv) {
    ImageParam im1(UInt(8), 1);
    Buffer<uint8_t> im2(10), im3(20);
    Param<int> j;

    assert(im1.dimensions() == 1);
    assert(im2.dimensions() == 1);
    assert(im3.dimensions() == 1);

    Func f;
    Var x;
    f(x) = x + im1.width();
    // The update covers [0, clamp(im2(j), 0, 99)) and overwrites it with 37.
    RDom r(0, clamp(im2(j), 0, 99));
    f(r) = 37;

    im2(3) = 10;

    j.set(3);
    im1.set(im3);
    Buffer<int> result = f.realize(100);

    const int count = 100;
    for (int idx = 0; idx < count; idx++) {
        // Below im2(3) the update stage wins; above it the pure value is
        // idx + 20 (im3, bound to im1, was constructed with size 20).
        int correct;
        if (idx < im2(3)) {
            correct = 37;
        } else {
            correct = idx + 20;
        }
        if (result(idx) == correct) {
            continue;
        }
        printf("result(%d) = %d instead of %d\n", idx, result(idx), correct);
        return -1;
    }

    printf("Success!\n");
    return 0;
}
コード例 #6
0
// Memoization test with a chain of three Funcs where both intermediate
// stages (f1 and f2) are compute_root + memoize and are only needed when the
// boolean Param is true.
//
// toggle_val: value bound to the Param (also stored in set_toggle1/set_toggle2).
// index:      stored in buffer_index and used as a suffix for the Func names.
// Returns 0 if check_correctness_single accepts the output, -1 otherwise.
int non_trivial_allocate_predicate_test(bool toggle_val, int index) {
    buffer_index = index;

    const std::string suffix = std::to_string(index);

    Param<bool> toggle;
    Func f1("f1_" + suffix), f2("f2_" + suffix);
    Func f3("f3_" + suffix);
    Var x;

    // Generate allocate f1[...] if toggle
    f1(x) = 2*x;
    f2(x) = select(toggle, f1(x), 1);
    f3(x) = select(toggle, f2(x), 1);

    f1.compute_root().memoize();
    f2.compute_root().memoize();

    f3.set_custom_trace(&double_toggle_trace);
    f1.trace_stores();
    f2.trace_stores();

    f3.compile_jit();

    set_toggle1 = toggle_val;
    set_toggle2 = toggle_val;
    toggle.set(set_toggle1);

    Image<int> out = f3.realize(10);
    const bool ok = (check_correctness_single(out, set_toggle1) == 0);
    return ok ? 0 : -1;
}
コード例 #7
0
// Memoization test with a single memoized producer: f2 selects between the
// memoized f1 and the constant 1 depending on a boolean Param.
//
// toggle_val: value bound to the Param (also stored in set_toggle1).
// index:      stored in buffer_index and used as a suffix for the Func names.
// Returns 0 if check_correctness_single accepts the output, -1 otherwise.
int single_memoize_test(bool toggle_val, int index) {
    buffer_index = index;

    const std::string suffix = std::to_string(index);

    Param<bool> toggle;
    Func f1("f1_" + suffix), f2("f2_" + suffix);
    Var x;

    f1(x) = 2*x;
    f2(x) = select(toggle, f1(x), 1);

    f1.compute_root().memoize();

    f2.set_custom_trace(&single_toggle_trace);
    f1.trace_stores();

    f2.compile_jit();

    set_toggle1 = toggle_val;
    toggle.set(set_toggle1);

    Image<int> out = f2.realize(10);
    const bool ok = (check_correctness_single(out, set_toggle1) == 0);
    return ok ? 0 : -1;
}
コード例 #8
0
ファイル: store_in.cpp プロジェクト: jiapei100/Halide
// Realizes a pipeline with three compute_root producers stored in the given
// memory types and verifies that the number of calls counted in `mallocs`
// equals the number of producers requested in MemoryType::Heap.
// On a mismatch a diagnostic is written to std::cerr and the process exits
// with status -1.
void check(MemoryType t1, MemoryType t2, MemoryType t3) {
    Var x;

    // By default, small constant-sized allocations, or
    // allocations that can be bounded with a small constant size,
    // go on the stack. Other allocations go on the heap.

    Func f1, f2, f3;
    f1(x) = x;
    f1.compute_root().store_in(t1);
    f2(x) = x;
    f2.compute_root().store_in(t2);
    f3(x) = x;
    f3.compute_root().store_in(t3);

    Func f;
    Param<bool> p;
    f(x) = (f1(0) + f1(1)) + f2(select(p, 0, 2)) + f2(0) + f3(x % 1000);

    p.set(true);

    // One malloc is expected per producer that was placed on the heap.
    auto heap_count = [](MemoryType t) { return t == MemoryType::Heap ? 1 : 0; };
    const int expected_mallocs = heap_count(t1) + heap_count(t2) + heap_count(t3);

    mallocs = 0;
    f.set_custom_allocator(my_malloc, my_free);
    f.realize(1024);

    if (mallocs == expected_mallocs) {
        return;
    }

    std::cerr << "Wrong number of mallocs for " << t1 << ", " << t2 << ", " << t3 << "\n"
              << "Expected " << expected_mallocs << " got " << mallocs << "\n";
    exit(-1);
}
コード例 #9
0
ファイル: parallel_nested.cpp プロジェクト: 202198/Halide
// Marks all three dimensions of a 3-D Func as parallel and verifies that the
// realized 64x64x64 result matches x*y + z*k + 1 with k bound to 3.
// Returns 0 on success, -1 after reporting the first wrong element.
int main(int argc, char **argv) {
    const int size = 64;

    Var x, y, z;
    Func f;

    Param<int> k;
    k.set(3);

    f(x, y, z) = x*y+z*k+1;

    f.parallel(x);
    f.parallel(y);
    f.parallel(z);

    Image<int> im = f.realize(size, size, size);

    for (int xi = 0; xi < size; xi++) {
        for (int yi = 0; yi < size; yi++) {
            for (int zi = 0; zi < size; zi++) {
                const int expected = xi*yi+zi*3+1;
                if (im(xi, yi, zi) == expected) {
                    continue;
                }
                printf("im(%d, %d, %d) = %d\n", xi, yi, zi, im(xi, yi, zi));
                return -1;
            }
        }
    }

    printf("Success!\n");
    return 0;
}
コード例 #10
0
ファイル: extern_consumer.cpp プロジェクト: Amos-zq/Halide
int main(int argc, char **argv) {
    // Define a pipeline that dumps some squares to a file using an
    // external consumer stage.
    Func source;
    Var x;
    source(x) = x*x;

    Param<int> min, extent;
    Param<const char *> filename;

    Func sink;
    std::vector<ExternFuncArgument> args;
    args.push_back(source);
    args.push_back(filename);
    args.push_back(min);
    args.push_back(extent);
    sink.define_extern("dump_to_file", args, Int(32), 0);

    source.compute_root();

    sink.compile_jit();

    // Dump the first 10 squares to a file
    filename.set("halide_test_extern_consumer.txt");
    min.set(0);
    extent.set(10);
    sink.realize();

    if (!check_result())
        return -1;

    // Test ImageParam ExternFuncArgument via passed in image.
    Image<int32_t> buf = source.realize(10);
    ImageParam passed_in(Int(32), 1);
    passed_in.set(buf);

    Func sink2;
    std::vector<ExternFuncArgument> args2;
    args2.push_back(passed_in);
    args2.push_back(filename);
    args2.push_back(min);
    args2.push_back(extent);
    sink2.define_extern("dump_to_file", args2, Int(32), 0);

    sink2.realize();

    if (!check_result())
        return -1;

    printf("Success!\n");
    return 0;

}
コード例 #11
0
ファイル: varying.cpp プロジェクト: jiapei100/Halide
 // Constructor: gives the six members m0..m5 the names "m0".."m5" and then
 // calls set() on each one with the matching element m[0]..m[5].
 // NOTE(review): the declarations of m and m0..m5 are outside this view --
 // presumably m is a six-element coefficient array and m0..m5 are Halide
 // Params; confirm against the enclosing struct.
 CoordXform() : m0("m0"), m1("m1"), m2("m2"), m3("m3"), m4("m4"), m5("m5") {
     m0.set(m[0]);
     m1.set(m[1]);
     m2.set(m[2]);
     m3.set(m[3]);
     m4.set(m[4]);
     m5.set(m[5]);
 }
コード例 #12
0
// Memoizes a producer that is stored at root but computed per slice of the
// consumer, then asserts every element of the 128x128 result.
// The checks use assert(), so they are compiled out when NDEBUG is defined.
int main(int argc, char **argv) {
    Param<float> val;

    Func f, g;
    Var x, y;

    // f depends on the scalar Param and on x; g sums three horizontal
    // neighbours of f.
    f(x, y) = val + cast<uint8_t>(x);
    g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);

    // Split g's y dimension by 16, keep f's storage at root and compute
    // (and memoize) f at g's outer y loop.
    g.split(y, y, _, 16);
    f.store_root();
    f.compute_at(g, y).memoize();

    val.set(23.0f);
    Image<uint8_t> out = g.realize(128, 128);

    // The expected value is independent of j: three times the Param value
    // plus the three x coordinates read by g, truncated to uint8_t.
    for (int32_t i = 0; i < 128; i++) {
        for (int32_t j = 0; j < 128; j++) {
            assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
        }
    }
}
コード例 #13
0
int main(int argc, char **argv) {
    Var x, y, z;
    Func f, g;

    Param<int> k;
    k.set(3);

    f(x, y, z) = x*y+z*k+1;
    g(x, y, z) = f(x, y, z) + 2;

    f.parallel(x);
    f.parallel(y);
    g.parallel(z);

    f.compute_at(g, z);

    auto target = get_jit_target_from_environment();
    if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        g.hexagon().vectorize(x, 32);
        f.vectorize(x, 32);
    }

    Buffer<int> im = g.realize(64, 64, 64);

    for (int x = 0; x < 64; x++) {
        for (int y = 0; y < 64; y++) {
            for (int z = 0; z < 64; z++) {
                if (im(x, y, z) != x*y+z*3+3) {
                    printf("im(%d, %d, %d) = %d\n", x, y, z, im(x, y, z));
                    return -1;
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
コード例 #14
0
ファイル: parallel.cpp プロジェクト: adityaatluri/Halide
int main(int argc, char **argv) {
    Var x;
    Func f;

    Param<int> k;
    k.set(3);

    f(x) = x*k;

    f.parallel(x);

    Buffer<int> im = f.realize(16);

    for (int i = 0; i < 16; i++) {
        if (im(i) != i*3) {
            printf("im(%d) = %d\n", i, im(i));
            return -1;
        }
    }

    printf("Success!\n");
    return 0;
}
コード例 #15
0
  // Entry point of the dictionary generator.
  //
  // Parses the command line, loads the dictionary configuration (DICRC) from
  // the dictionary directory, and then writes into the output directory:
  // the left/right context id files, the rewritten unknown-word definition
  // and CSV dictionaries, the connection matrix, and copies of the character
  // property definition, the rewrite file and DICRC (plus the feature file
  // when the training algorithm is "crf").
  //
  // Returns 0 on success or after handling --help/--version, and -1 when the
  // command line cannot be parsed. Any failed CHECK_DIE aborts via that macro.
  static int run(int argc, char **argv) {
    static const MeCab::Option long_options[] = {
      { "dicdir",  'd',  ".",   "DIR", "set DIR as dicdir(default \".\" )" },
      { "outdir",  'o',  ".",   "DIR", "set DIR as output dir" },
      { "model",   'm',  0,     "FILE",   "use FILE as model file" },
      { "version", 'v',  0,   0,  "show the version and exit"  },
      { "training-algorithm", 'a',  "crf",    "(crf|hmm)",
        "set training algorithm" },
      { "default-emission-cost", 'E', "4000", "INT",
        "set default emission cost for HMM" },
      { "default-transition-cost", 'T', "4000", "INT",
        "set default transition cost for HMM" },
      { "help",    'h',  0,   0,  "show this help and exit."      },
      { 0, 0, 0, 0 }
    };

    Param param;

    if (!param.open(argc, argv, long_options)) {
      std::cout << param.what() << "\n\n" <<  COPYRIGHT
                << "\ntry '--help' for more information." << std::endl;
      return -1;
    }

    if (!param.help_version()) return 0;

    ContextID cid;
    DecoderFeatureIndex fi;
    DictionaryRewriter rewrite;

    const std::string dicdir = param.get<std::string>("dicdir");
    const std::string outdir = param.get<std::string>("outdir");
    const std::string model = param.get<std::string>("model");

    // DCONF/OCONF build a path inside the dictionary / output directory.
    // The returned pointer belongs to a temporary std::string, so it is only
    // valid within the full expression that uses the macro.
#define DCONF(file) create_filename(dicdir, std::string(file)).c_str()
#define OCONF(file) create_filename(outdir, std::string(file)).c_str()

    CHECK_DIE(param.load(DCONF(DICRC)))
        << "no such file or directory: " << DCONF(DICRC);

    // Read the charset from the system dictionary; the dictionary itself is
    // only needed inside this scope.
    std::string charset;
    {
      Dictionary dic;
      CHECK_DIE(dic.open(DCONF(SYS_DIC_FILE), "r"));
      charset = dic.charset();
      CHECK_DIE(!charset.empty());
    }

    // The default costs stay 0 unless the HMM training algorithm is selected.
    int default_emission_cost = 0;
    int default_transition_cost = 0;

    std::string type = param.get<std::string>("training-algorithm");
    toLower(&type);

    if (type == "hmm") {
      default_emission_cost =
          param.get<int>("default-emission-cost");
      default_transition_cost =
          param.get<int>("default-transition-cost");
      CHECK_DIE(default_transition_cost > 0)
          << "default transition cost must be > 0";
      // The message used to name the transition cost here as well, which
      // pointed users at the wrong option.
      CHECK_DIE(default_emission_cost > 0)
          << "default emission cost must be > 0";
      param.set("identity-template", 1);
    }

    CharProperty property;
    CHECK_DIE(property.open(param));
    property.set_charset(charset.c_str());

    const std::string bos = param.get<std::string>("bos-feature");
    const int factor = param.get<int>("cost-factor");

    std::vector<std::string> dic;
    enum_csv_dictionaries(dicdir.c_str(), &dic);

    // Validate all inputs before anything is written to the output directory.
    {
      CHECK_DIE(dicdir != outdir) <<
          "output directory = dictionary directory! "
          "Please specify different directory.";
      CHECK_DIE(!outdir.empty()) << "output directory is empty";
      CHECK_DIE(!model.empty()) << "model file is empty";
      CHECK_DIE(fi.open(param)) << fi.what();
      CHECK_DIE(factor > 0)   << "cost factor needs to be positive value";
      CHECK_DIE(!bos.empty()) << "bos-feature is empty";
      CHECK_DIE(dic.size()) << "no dictionary is found in " << dicdir;
      CHECK_DIE(rewrite.open(DCONF(REWRITE_FILE)));
    }

    // First pass: collect context ids from the BOS feature, the unknown-word
    // definition and every CSV dictionary.
    gencid_bos(bos, &rewrite, &cid);
    gencid(DCONF(UNK_DEF_FILE), &rewrite, &cid);

    for (std::vector<std::string>::const_iterator it = dic.begin();
         it != dic.end();
         ++it) {
      gencid(it->c_str(), &rewrite, &cid);
    }

    std::cout << "emitting "
              << OCONF(LEFT_ID_FILE) << "/ "
              << OCONF(RIGHT_ID_FILE) << std::endl;

    cid.build();
    cid.save(OCONF(LEFT_ID_FILE), OCONF(RIGHT_ID_FILE));

    // Second pass: emit the unknown-word definition and each dictionary into
    // the output directory.
    gendic(DCONF(UNK_DEF_FILE), OCONF(UNK_DEF_FILE), property,
           &rewrite, cid, &fi, true, factor, default_emission_cost);

    for (std::vector<std::string>::const_iterator it = dic.begin();
         it != dic.end();
         ++it) {
      // Strip the directory part so the output keeps only the file name.
      std::string file =  *it;
      remove_pathname(&file);
      gendic(it->c_str(), OCONF(file.c_str()), property,
             &rewrite, cid, &fi, false, factor, default_emission_cost);
    }

    genmatrix(OCONF(MATRIX_DEF_FILE), cid, &fi,
              factor, default_transition_cost);

    copy(DCONF(CHAR_PROPERTY_DEF_FILE), OCONF(CHAR_PROPERTY_DEF_FILE));
    copy(DCONF(REWRITE_FILE), OCONF(REWRITE_FILE));
    copy(DCONF(DICRC), OCONF(DICRC));

    if (type == "crf")
      copy(DCONF(FEATURE_FILE), OCONF(FEATURE_FILE));

#undef OCONF
#undef DCONF

    std::cout <<  "\ndone!\n";

    return 0;
  }
コード例 #16
0
ファイル: rfactor.cpp プロジェクト: jiawen/Halide
int tuple_specialize_rdom_predicate_rfactor_test(bool compile_module) {
    Func f("f"), g("g");
    Var x("x"), y("y"), z("z");

    f(x, y, z) = Tuple(x + y + z, x - y + z);
    f.compute_root();

    RDom r(5, 20, 5, 20, 5, 20);
    r.where(r.x*r.x + r.z*r.z <= 200);
    r.where(r.y*r.z + r.z*r.z > 100);

    Func ref("ref");
    ref(x, y) = Tuple(1, 3);
    ref(x, y) = Tuple(ref(x, y)[0]*f(r.x, r.y, r.z)[0], ref(x, y)[1] + 2*f(r.x, r.y, r.z)[1]);
    Realization ref_rn = ref.realize(10, 10);

    g(x, y) = Tuple(1, 3);

    g(x, y) = Tuple(g(x, y)[0]*f(r.x, r.y, r.z)[0], g(x, y)[1] + 2*f(r.x, r.y, r.z)[1]);

    Param<int> p;
    Param<bool> q;

    Var u("u"), v("v"), w("w");
    Func intm1 = g.update(0).specialize(p >= 5).rfactor({{r.y, v}, {r.z, w}});
    intm1.update(0).parallel(v, 2);

    RVar rxi("rxi"), rxo("rxo");
    intm1.update(0).split(r.x, rxo, rxi, 2);
    Var t("t");
    Func intm2 = intm1.update(0).specialize(q).rfactor(rxi, t);
    Func intm3 = intm1.update(0).specialize(!q).rfactor(rxo, t);
    Func intm4 = g.update(0).rfactor({{r.x, u}, {r.z, w}});
    intm4.update(0).vectorize(u);

    if (compile_module) {
        // Check the call graphs.
        Module m = g.compile_to_module({g.infer_arguments()});
        CheckCalls checker;
        m.functions().front().body.accept(&checker);

        CallGraphs expected = {
            {g.name(), {}},
            {g.update(0).name(), {intm1.name() + ".0", intm1.name() + ".1",
                                  intm4.name() + ".0", intm4.name() + ".1",
                                  g.name() + ".0", g.name() + ".1"}},
            {intm1.name(), {}},
            {intm1.update(0).name(), {intm2.name() + ".0", intm2.name() + ".1",
                                      intm3.name() + ".0", intm3.name() + ".1",
                                      intm1.name() + ".0", intm1.name() + ".1"}},
            {intm2.name(), {}},
            {intm2.update(0).name(), {f.name() + ".0", f.name() + ".1",
                                      intm2.name() + ".0", intm2.name() + ".1"}},
            {intm3.name(), {}},
            {intm3.update(0).name(), {f.name() + ".0", f.name() + ".1",
                                      intm3.name() + ".0", intm3.name() + ".1"}},
            {intm4.name(), {}},
            {intm4.update(0).name(), {f.name() + ".0", f.name() + ".1",
                                      intm4.name() + ".0", intm4.name() + ".1"}},
            {f.name(), {}},
        };
        if (check_call_graphs(checker.calls, expected) != 0) {
            return -1;
        }
    } else {
        {
            p.set(10);
            q.set(true);
            Realization rn = g.realize(10, 10);
            Image<int> im1(rn[0]);
            Image<int> im2(rn[1]);

            Image<int> ref_im1(ref_rn[0]);
            Image<int> ref_im2(ref_rn[1]);

            auto func1 = [&ref_im1](int x, int y, int z) {
                return ref_im1(x, y, z);
            };
            if (check_image(im1, func1)) {
                return -1;
            }
            auto func2 = [&ref_im2](int x, int y, int z) {
                return ref_im2(x, y, z);
            };
            if (check_image(im2, func2)) {
                return -1;
            }
        }
        {
            p.set(10);
            q.set(false);
            Realization rn = g.realize(10, 10);
            Image<int> im1(rn[0]);
            Image<int> im2(rn[1]);

            Image<int> ref_im1(ref_rn[0]);
            Image<int> ref_im2(ref_rn[1]);

            auto func1 = [&ref_im1](int x, int y, int z) {
                return ref_im1(x, y, z);
            };
            if (check_image(im1, func1)) {
                return -1;
            }
            auto func2 = [&ref_im2](int x, int y, int z) {
                return ref_im2(x, y, z);
            };
            if (check_image(im2, func2)) {
                return -1;
            }
        }
        {
            p.set(0);
            q.set(true);
            Realization rn = g.realize(10, 10);
            Image<int> im1(rn[0]);
            Image<int> im2(rn[1]);

            Image<int> ref_im1(ref_rn[0]);
            Image<int> ref_im2(ref_rn[1]);

            auto func1 = [&ref_im1](int x, int y, int z) {
                return ref_im1(x, y, z);
            };
            if (check_image(im1, func1)) {
                return -1;
            }
            auto func2 = [&ref_im2](int x, int y, int z) {
                return ref_im2(x, y, z);
            };
            if (check_image(im2, func2)) {
                return -1;
            }
        }
        {
            p.set(0);
            q.set(false);
            Realization rn = g.realize(10, 10);
            Image<int> im1(rn[0]);
            Image<int> im2(rn[1]);

            Image<int> ref_im1(ref_rn[0]);
            Image<int> ref_im2(ref_rn[1]);

            auto func1 = [&ref_im1](int x, int y, int z) {
                return ref_im1(x, y, z);
            };
            if (check_image(im1, func1)) {
                return -1;
            }
            auto func2 = [&ref_im2](int x, int y, int z) {
                return ref_im2(x, y, z);
            };
            if (check_image(im2, func2)) {
                return -1;
            }
        }
    }
    return 0;
}
コード例 #17
0
// Checks that the producer g of a reduction guarded by the predicate p > 3 is
// only evaluated when the predicate holds. The counters niters /
// niters_expected are compared after each realization.
//
// index: stored in buffer_index and used as a suffix for the Func names.
// Returns 0 on success, -1 on a wrong pixel or an unexpected iteration count.
int intermediate_computed_if_param_test(int index) {
    buffer_index = index;

    const std::string suffix = std::to_string(index);
    Func f("f_" + suffix), g("g_" + suffix);
    Var x("x"), y("y");
    Param<int> p;

    g(x, y) = x + y;

    f(x, y) = x + y;
    RDom r(0, 100, 0, 100);
    r.where(p > 3);
    f(r.x, r.y) += 2*g(r.x, r.y);

    // Expect g to be only computed over x=[0,99] and y=[0,99] if param is bigger
    // than 3.
    g.compute_root();

    f.set_custom_trace(&box_bound_trace);
    g.trace_stores();
    g.trace_realizations();

    {
        printf("....Set p to 5, expect g to be computed\n");
        p.set(5);
        run_tracer = false;
        niters_expected = 100*100;
        niters = 0;
        Image<int> im = f.realize(200, 200);
        for (int row = 0; row < im.height(); row++) {
            for (int col = 0; col < im.width(); col++) {
                // Inside the reduction domain the update adds 2*(x + y).
                const bool updated = (0 <= col && col <= 99) && (0 <= row && row <= 99);
                const int correct = updated ? 3*(col + row) : (col + row);
                if (im(col, row) == correct) {
                    continue;
                }
                printf("im(%d, %d) = %d instead of %d\n",
                       col, row, im(col, row), correct);
                return -1;
            }
        }
        if (niters != niters_expected) {
            printf("intermediate_computed_if_param_test : Expect niters on g to be %d but got %d instead\n",
                   niters_expected, niters);
            return -1;
        }
    }

    {
        printf("....Set p to 0, expect g to be not computed\n");
        p.set(0);
        run_tracer = false;
        niters_expected = 0;
        niters = 0;
        Image<int> im = f.realize(200, 200);
        for (int row = 0; row < im.height(); row++) {
            for (int col = 0; col < im.width(); col++) {
                // With the predicate false only the pure definition applies.
                const int correct = col + row;
                if (im(col, row) == correct) {
                    continue;
                }
                printf("im(%d, %d) = %d instead of %d\n",
                       col, row, im(col, row), correct);
                return -1;
            }
        }
        if (niters != niters_expected) {
            printf("intermediate_computed_if_param_test : Expect niters on g to be %d but got %d instead\n",
                   niters_expected, niters);
            return -1;
        }
    }
    return 0;
}
コード例 #18
0
ファイル: image_wrap.cpp プロジェクト: bleibig/Halide
// Defines an update stage on g after img.in(g) has been called and verifies,
// for both values of the specialization Param, that the compiled call graph
// routes g through the wrapper and that the realized image is correct.
// Returns 0 on success and -1 on any mismatch.
int update_defined_after_wrap_test() {
    Func source("source"), g("g");
    Var x("x"), y("y");

    source(x, y) = x + y;
    ImageParam img(Int(32), 2, "img");
    Buffer<int> buf = source.realize(200, 200);
    img.set(buf);

    g(x, y) = img(x, y);

    Func wrapper = img.in(g);

    // Update of 'g' is defined after img.in(g) is called. g's updates should
    // still call img's wrapper.
    RDom r(0, 100, 0, 100);
    r.where(r.x < r.y);
    g(r.x, r.y) += 2*img(r.x, r.y);

    Param<bool> param;

    Var xi("xi");
    RVar rxo("rxo"), rxi("rxi");
    g.specialize(param).vectorize(x, 8).unroll(x, 2).split(x, x, xi, 4).parallel(x);
    g.update(0).split(r.x, rxo, rxi, 2).unroll(rxi);
    Func img_f = img;
    img_f.compute_root();
    wrapper.compute_root().vectorize(_0, 8).unroll(_0, 2).split(_0, _0, xi, 4).parallel(_0);

    // Reference value: the update triples x + y where x < y inside the
    // reduction domain and leaves x + y elsewhere.
    auto reference = [](int x, int y) {
        const bool updated = (0 <= x && x <= 99) && (0 <= y && y <= 99) && (x < y);
        return updated ? 3*(x + y) : (x + y);
    };

    const bool param_values[] = {true, false};
    for (bool value : param_values) {
        param.set(value);

        // Check the call graphs.
        // Expect initialization of 'g' to call 'wrapper' and its update to call
        // 'wrapper' and 'g', wrapper' to call 'img_f', 'img_f' to call 'img'
        Module m = g.compile_to_module({g.infer_arguments()});
        CheckCalls c;
        m.functions().front().body.accept(&c);

        CallGraphs expected = {
            {g.name(), {wrapper.name(), g.name()}},
            {wrapper.name(), {img_f.name()}},
            {img_f.name(), {img.name()}},
        };
        if (check_call_graphs(c.calls, expected) != 0) {
            return -1;
        }

        Buffer<int> im = g.realize(200, 200);
        if (check_image(im, reference)) {
            return -1;
        }
    }

    return 0;
}
コード例 #19
0
ファイル: memoize.cpp プロジェクト: josephsieh/Halide
// Exercises Func::memoize() through the JIT. Each scoped section builds a
// small pipeline around an extern stage, realizes it one or more times, and
// asserts on both the produced pixels and a file-level call counter.
//
// NOTE(review): the counters (call_count, call_count_with_arg,
// call_count_with_arg_parallel, call_count_staged) and the extern functions
// are defined outside this function; presumably each extern bumps its counter
// once per invocation -- confirm against their definitions.
// NOTE(review): every check is an assert(), so nothing is verified when this
// file is compiled with NDEBUG.
//
// Returns 0 after printing "Success!" to stderr.
int main(int argc, char **argv) {

    {
        // Zero-dimensional memoized Func realized twice; the counter is
        // expected to read 1 afterwards.
        call_count = 0;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f;
        f() = count_calls(0, 0);
        f.compute_root().memoize();

        Image<uint8_t> result1 = f.realize();
        Image<uint8_t> result2 = f.realize();

        assert(result1(0) == 42);
        assert(result2(0) == 42);

        assert(call_count == 1);
    }

    {
        // The memoized Func depends on a scalar Param: two realizations per
        // Param value, and the counter is expected to grow by one per value.
        call_count = 0;
        Param<int32_t> coord;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f, g;
        Var x, y;
        f() = count_calls(coord, coord);
        f.compute_root().memoize();

        g(x, y) = f();

        coord.set(0);
        Image<uint8_t> out1 = g.realize(256, 256);
        Image<uint8_t> out2 = g.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == 42);
                assert(out2(i, j) == 42);
            }
        }
        assert(call_count == 1);

        coord.set(1);
        Image<uint8_t> out3 = g.realize(256, 256);
        Image<uint8_t> out4 = g.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out3(i, j) == 42);
                assert(out4(i, j) == 42);
            }
        }
        assert(call_count == 2);
    }

    {
        // The extern stage itself is memoized and referenced twice by f.
        call_count = 0;
        Func count_calls;
        count_calls.define_extern("count_calls",
                                  std::vector<ExternFuncArgument>(),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (42 + 42));
                assert(out2(i, j) == (42 + 42));
            }
        }
        assert(call_count == 1);
    }

    // The next section asserts on call_count_with_arg, so reset that counter
    // explicitly rather than relying on its initial value. call_count is still
    // reset as before.
    call_count = 0;
    call_count_with_arg = 0;

    {
        // Two memoized extern stages sharing one extern function but given
        // different constant arguments; the counter is expected to read 2.
        Func count_calls_23;
        count_calls_23.define_extern("count_calls_with_arg",
                                     Internal::vec(ExternFuncArgument(cast<uint8_t>(23))),
                                     UInt(8), 2);

        Func count_calls_42;
        count_calls_42.define_extern("count_calls_with_arg",
                                     Internal::vec(ExternFuncArgument(cast<uint8_t>(42))),
                                     UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls_23(x, y) + count_calls_42(x, y);
        count_calls_23.compute_root().memoize();
        count_calls_42.compute_root().memoize();

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 42));
                assert(out2(i, j) == (23 + 42));
            }
        }
        assert(call_count_with_arg == 2);
    }

    {
        // Same shape as above, but the extern arguments are Params that are
        // changed between realizations. Six realizations are performed and the
        // counter is expected to read 4 at the end.
        Param<uint8_t> val1;
        Param<uint8_t> val2;

        call_count_with_arg = 0;
        Func count_calls_val1;
        count_calls_val1.define_extern("count_calls_with_arg",
                                       Internal::vec(ExternFuncArgument(Expr(val1))),
                                       UInt(8), 2);

        Func count_calls_val2;
        count_calls_val2.define_extern("count_calls_with_arg",
                                       Internal::vec(ExternFuncArgument(Expr(val2))),
                                       UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls_val1(x, y) + count_calls_val2(x, y);
        count_calls_val1.compute_root().memoize();
        count_calls_val2.compute_root().memoize();

        val1.set(23);
        val2.set(42);

        Image<uint8_t> out1 = f.realize(256, 256);
        Image<uint8_t> out2 = f.realize(256, 256);

        val1.set(42);
        Image<uint8_t> out3 = f.realize(256, 256);

        val1.set(23);
        Image<uint8_t> out4 = f.realize(256, 256);

        val1.set(42);
        Image<uint8_t> out5 = f.realize(256, 256);

        val2.set(57);
        Image<uint8_t> out6 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 42));
                assert(out2(i, j) == (23 + 42));
                assert(out3(i, j) == (42 + 42));
                assert(out4(i, j) == (23 + 42));
                assert(out5(i, j) == (42 + 42));
                assert(out6(i, j) == (42 + 57));
            }
        }
        assert(call_count_with_arg == 4);
    }

    {
        // A float Param cast to uint8_t without memoize_tag: 23.0f and 23.4f
        // give the same pixels, yet the counter is expected to read 2.
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        val.set(23.0f);
        Image<uint8_t> out1 = f.realize(256, 256);
        val.set(23.4f);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 23));
                assert(out2(i, j) == (23 + 23));
            }
        }
        assert(call_count_with_arg == 2);
    }

    {
        // Same as the previous section, but the cast expression is wrapped in
        // memoize_tag; here the counter is expected to read 1.
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(memoize_tag(cast<uint8_t>(val)))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + count_calls(x, y);
        count_calls.compute_root().memoize();

        val.set(23.0f);
        Image<uint8_t> out1 = f.realize(256, 256);
        val.set(23.4f);
        Image<uint8_t> out2 = f.realize(256, 256);

        for (int32_t i = 0; i < 256; i++) {
            for (int32_t j = 0; j < 256; j++) {
                assert(out1(i, j) == (23 + 23));
                assert(out2(i, j) == (23 + 23));
            }
        }
        assert(call_count_with_arg == 1);
    }

    {
        // Case with bounds computed not equal to bounds realized.
        Param<float> val;
        Param<int32_t> index;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);
        Func f, g, h;
        Var x;

        f(x) = count_calls(x, 0) + cast<uint8_t>(x);
        g(x) = f(x);
        h(x) = g(4) + g(index);

        f.compute_root().memoize();
        g.vectorize(x, 8).compute_at(h, x);

        val.set(23.0f);
        index.set(2);
        Image<uint8_t> out1 = h.realize(1);

        assert(out1(0) == (uint8_t)(2 * 23 + 4 + 2));
        assert(call_count_with_arg == 3);

        // Changing the index Param is expected to add exactly one more call.
        index.set(4);
        out1 = h.realize(1);

        assert(out1(0) == (uint8_t)(2 * 23 + 4 + 4));
        assert(call_count_with_arg == 4);
    }

    {
        // Test Tuple case
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = Tuple(count_calls(x, y) + cast<uint8_t>(x), x);
        count_calls.compute_root().memoize();
        f.compute_root().memoize();

        Func g;
        g(x, y) = Tuple(f(x, y)[0] + f(x - 1, y)[0] + f(x + 1, y)[0], f(x, y)[1]);

        val.set(23.0f);
        Realization out = g.realize(128, 128);
        Image<uint8_t> out0 = out[0];
        Image<int32_t> out1 = out[1];

        // Only the top-left 100x100 corner of the 128x128 result is checked.
        for (int32_t i = 0; i < 100; i++) {
            for (int32_t j = 0; j < 100; j++) {
                assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
                assert(out1(i, j) == i);
            }
        }
        out = g.realize(128, 128);
        out0 = out[0];
        out1 = out[1];

        for (int32_t i = 0; i < 100; i++) {
            for (int32_t j = 0; j < 100; j++) {
                assert(out0(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
                assert(out1(i, j) == i);
            }
        }
        assert(call_count_with_arg == 1);
    }

    {
        // Test cache eviction
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + cast<uint8_t>(x);
        count_calls.compute_root().memoize();

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);

        // 1000 realizations with a random Param value in [0, 255] each time.
        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }
        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count is %d.\n", call_count_with_arg);

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        // Test flushing entire cache with a single element larger than the cache
        Param<float> val;

        call_count_with_arg = 0;
        Func count_calls;
        count_calls.define_extern("count_calls_with_arg",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 2);

        Func f;
        Var x, y;
        f(x, y) = count_calls(x, y) + cast<uint8_t>(x);
        count_calls.compute_root().memoize();

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);

        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }

        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count before oversize realize is %d.\n", call_count_with_arg);
        call_count_with_arg = 0;

        // Two 1024x1024 realizations; their contents are not inspected, only
        // the resulting call count is reported.
        Image<uint8_t> big = g.realize(1024, 1024);
        Image<uint8_t> big2 = g.realize(1024, 1024);

        // TODO work out an assertion on call count here.
        fprintf(stderr, "Call count after oversize realize is %d.\n", call_count_with_arg);

        call_count_with_arg = 0;
        for (int v = 0; v < 1000; v++) {
            int r = rand() % 256;
            val.set((float)r);
            Image<uint8_t> out1 = g.realize(128, 128);

            for (int32_t i = 0; i < 100; i++) {
                for (int32_t j = 0; j < 100; j++) {
                    assert(out1(i, j) == (uint8_t)(3 * r + i + (i - 1) + (i + 1)));
                }
            }
        }

        fprintf(stderr, "Call count is %d.\n", call_count_with_arg);

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        // Test parallel cache access
        Param<float> val;

        Func count_calls;
        count_calls.define_extern("count_calls_with_arg_parallel",
                                  Internal::vec(ExternFuncArgument(cast<uint8_t>(val))),
                                  UInt(8), 3);

        Func f;
        Var x, y;
        // Ensure that all calls map to the same cache key, but pass a thread ID
        // through to avoid having to do locking or an atomic add
        f(x, y) = count_calls(x, y % 4, memoize_tag(y / 16, 0)) + cast<uint8_t>(x);

        Func g;
        g(x, y) = f(x, y) + f(x - 1, y) + f(x + 1, y);
        count_calls.compute_at(f, y).memoize();
        f.compute_at(g, y).memoize();
        g.parallel(y, 16);

        val.set(23.0f);
        Internal::JITSharedRuntime::memoization_cache_set_size(1000000);
        Image<uint8_t> out = g.realize(128, 128);

        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
                assert(out(i, j) == (uint8_t)(3 * 23 + i + (i - 1) + (i + 1)));
            }
        }

        // TODO work out an assertion on call counts here.
        // 128 rows with y / 16 as the slot index gives slots 0..7.
        for (int i = 0; i < 8; i++) {
            fprintf(stderr, "Call count for thread %d is %d.\n", i, call_count_with_arg_parallel[i]);
        }

        // Return cache size to default.
        Internal::JITSharedRuntime::memoization_cache_set_size(0);
    }

    {
        // Chain of four extern stages, each consuming the previous Func; only
        // the last stage is memoized. The result is realized twice and the
        // per-stage counters are printed after each realization.
        Param<float> val;

        Func f;
        Var x, y;
        f(x, y) = cast<uint8_t>((x << 8) + y);

        Func prev_func = f;

        Func stage[4];
        for (int i = 0; i < 4; i++) {
            std::vector<ExternFuncArgument> args(3);
            args[0] = cast<int32_t>(i);
            args[1] = cast<int32_t>(val);
            args[2] = prev_func;
            stage[i].define_extern("count_calls_staged",
                                   args,
                                   UInt(8), 2);
            prev_func = stage[i];
        }

        f.compute_root();
        for (int i = 0; i < 3; i++) {
            stage[i].compute_root();
        }
        stage[3].compute_root().memoize();
        val.set(23.0f);
        Image<uint8_t> result = stage[3].realize(128, 128);

        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
                assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23));
            }
        }

        for (int i = 0; i < 4; i++) {
            fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]);
        }

        result = stage[3].realize(128, 128);
        for (int32_t i = 0; i < 128; i++) {
            for (int32_t j = 0; j < 128; j++) {
                assert(result(i, j) == (uint8_t)((i << 8) + j + 4 * 23));
            }
        }

        for (int i = 0; i < 4; i++) {
            fprintf(stderr, "Call count for stage %d is %d.\n", i, call_count_staged[i]);
        }
    }

    fprintf(stderr, "Success!\n");
    return 0;
}
コード例 #20
0
ファイル: lerp.cpp プロジェクト: AheadIO/Halide
// Test driver for lerp(). Runs check_range over a series of value/weight type
// combinations, then checks a constant-only lerp and a vectorized lerp on
// int32-cast image inputs with weights 0 and 1.
//
// NOTE(review): check_range is defined outside this function; each group of
// four numbers appears to describe one swept range (first value, second
// value, weight) and the trailing string is a label for diagnostics --
// confirm against its definition.
//
// Returns 0 after printing "Success!"; failures surface through assert().
int main(int argc, char **argv) {
    // Test bool
    check_range<bool, uint8_t>(0, 2, 0, 1,
                               0, 2, 0, 1,
                               0, 256, 0, 1,
                               "<bool, uint8_t> exhaustive");

    // Exhaustively test 8-bit cases
    check_range<uint8_t, uint8_t>(0, 256, 0, 1,
                                  0, 256, 0, 1,
                                  0, 256, 0, 1,
                                  "<uint8_t, uint8_t> exhaustive");
    check_range<int8_t, uint8_t>(0, 256, -128, 1,
                                 0, 256, -128, 1,
                                 0, 256, 0, 1,
                                 "<int8_t, uint8_t> exhaustive");
    check_range<uint8_t, float>(0, 256, 0, 1,
                                0, 256, 0, 1,
                                0, 256, 0, 1/255.0f,
                                "<uint8_t, float> exhaustive");
    check_range<int8_t, float>(0, 256, -128, 1,
                               0, 256, -128, 1,
                               0, 256, 0, 1/255.0f,
                               "<int8_t, float> exhaustive");

    // Check all delta values for 16-bit, verify swapping arguments doesn't break
    check_range<uint16_t, uint16_t>(0, 65536, 0, 1,
                                    65535, 1, 0, 1,
                                    0, 257, 255, 1,
                                    "<uint16_t, uint16_t> all zero starts");
    check_range<uint16_t, uint16_t>(65535, 1, 0, 1,
                                    0, 65536, 0, 1,
                                    0, 257, 255, 1,
                                    "<uint16_t, uint16_t> all one starts");

    // Verify different bit sizes for value and weight types
    check_range<uint16_t, uint8_t>(0, 1, 0, 1,
                                   65535, 1, 0, 1,
                                   0, 255, 1, 1,
                                   "<uint16_t, uint8_t> zero, one uint8_t weight test");
    // The label names the actual instantiation (<uint16_t, uint32_t>) so a
    // failure report points at the right case.
    check_range<uint16_t, uint32_t>(0, 1, 0, 1,
                                    65535, 1, 0, 1,
                                    std::numeric_limits<int32_t>::min(), 257, 255 * 65535, 1,
                                    "<uint16_t, uint32_t> zero, one uint32_t weight test");
    check_range<uint32_t, uint8_t>(0, 1, 0, 1,
                                   1 << 31, 1, 0, 1,
                                   0, 255, 0, 1,
                                   "<uint32_t, uint8_t> weight test");
    check_range<uint32_t, uint16_t>(0, 1, 0, 1,
                                    1 << 31, 1, 0, 1,
                                    0, 65535, 0, 1,
                                    "<uint32_t, uint16_t> weight test");

    // Verify float weights with integer values
    check_range<uint16_t, float>(0, 1, 0, 1,
                                 65535, 1, 0, 1,
                                 0, 257, 0, 255.0f/65535.0f,
                                 "<uint16_t, float> zero, one float weight test");

    check_range<int16_t, uint16_t>(0, 65536, -32768, 1,
                                   0, 1, 0, 1,
                                   0, 257, 0, 255,
                                   "<int16_t, uint16_t> all zero starts");

  #if 0 // takes too long, difficult to test with uint32_t
    // Check all delta values for 32-bit, do it in signed arithmetic
    check_range<int32_t, uint32_t>(std::numeric_limits<int32_t>::min(), std::numeric_limits<int32_t>::max(), 0, 1,
                                   1 << 31, 1, 0, 1,
                                   0, 1, 1 << 31, 1,
                                    "<uint32_t, uint32_t> all zero starts");
  #endif

    check_range<float, float>(0, 100, 0, .01,
                              0, 100, 0, .01,
                              0, 100, 0, .01,
                              "<float, float> float values 0 to 1 by 1/100ths");

    check_range<float, float>(0, 100, -5, .1,
                              0, 100, 0, .1,
                              0, 100, 0, .1,
                              "<float, float> float values -5 to 5 by 1/100ths");

    // Verify float values with integer weights
    check_range<float, uint8_t>(0, 100, -5, .1,
                                0, 100, 0, .1,
                                0, 255, 0, 1,
                                "<float, uint8_t> float values -5 to 5 by 1/100ths");
    check_range<float, uint16_t>(0, 100, -5, .1,
                                 0, 100, 0, .1,
                                 0, 255, 0, 257,
                                 "<float, uint16_t> float values -5 to 5 by 1/100ths");
    check_range<float, uint32_t>(0, 100, -5, .1,
                                 0, 100, 0, .1,
                                 std::numeric_limits<int32_t>::min(), 257, 255 * 65535, 1,
                                 "<float, uint32_t> float values -5 to 5 by 1/100ths");

    // Check constant and constant case:
    Func lerp_constants("lerp_constants");
    lerp_constants() = lerp(0, cast<uint32_t>(1023), .5f);
    Image<uint32_t> result = lerp_constants.realize();

    // The reference value is computed from a uint16_t lerp widened to uint32_t.
    uint32_t expected = evaluate<uint32_t>(cast<uint32_t>(lerp(0, cast<uint16_t>(1023), .5f)));
    if (result(0) != expected) {
        // Print both values before the assert below fires.
        std::cerr << "Expected " << expected << " got " << result(0) << std::endl;
    }
    assert(result(0) == expected);

    // Add a little more coverage for uint32_t as this was failing
    // without being detected for a long time.

    Image<uint8_t> input_a_img(16, 16);
    Image<uint8_t> input_b_img(16, 16);

    // a counts up from 0 to 255 across the 16x16 grid; b is the same pattern
    // mirrored in both coordinates.
    for (int i = 0; i < 16; i ++) {
        for (int j = 0; j < 16; j ++) {
            input_a_img(i, j) = (i << 4) + j;
            input_b_img(i, j) = ((15 - i) << 4) + (15 - j);
        }
    }

    ImageParam input_a(UInt(8), 2);
    ImageParam input_b(UInt(8), 2);

    Var x, y;
    Func lerp_with_casts;
    Param<float> w;
    lerp_with_casts(x, y) = lerp(cast<int32_t>(input_a(x, y)), cast<int32_t>(input_b(x, y)), w);
    lerp_with_casts.vectorize(x, 4);

    input_a.set(input_a_img);
    input_b.set(input_b_img);

    // Weight 0 must reproduce input a, weight 1 must reproduce input b.
    w.set(0.0f);
    Image<int32_t> result_should_be_a = lerp_with_casts.realize(16, 16);
    w.set(1.0f);
    Image<int32_t> result_should_be_b = lerp_with_casts.realize(16, 16);

    for (int i = 0; i < 16; i ++) {
        for (int j = 0; j < 16; j ++) {
            assert(input_a_img(i, j) == result_should_be_a(i, j));
            assert(input_b_img(i, j) == result_should_be_b(i, j));
        }
    }

    std::cout << "Success!" << std::endl;
    return 0;
}
コード例 #21
0
int gpu_intermediate_computed_if_param_test(int index) {
    buffer_index = index;

    Func f("f_" + std::to_string(index)), g("g_" + std::to_string(index)), h("h_" + std::to_string(index));
    Var x("x"), y("y");
    Param<int> p;

    g(x, y) = x + y;
    h(x, y) = 10;

    f(x, y) = x + y;
    RDom r1(0, 100, 0, 100);
    r1.where(p > 3);
    f(r1.x, r1.y) += 2*g(r1.x, r1.y);

    RDom r2(0, 100, 0, 100);
    r2.where(p <= 3);
    f(r2.x, r2.y) += h(r2.x, r2.y) + g(r2.x, r2.y);

    f.update(0).specialize(p >= 2).gpu_tile(r1.x, r1.y, 4, 4);
    g.compute_root();
    h.compute_root();
    h.gpu_tile(x, y, 8, 8);

    {
        printf("....Set p to 5, expect g to be computed\n");
        p.set(5);
        run_tracer = false;
        niters_expected = 100*100;
        niters = 0;
        Image<int> im = f.realize(200, 200);
        for (int y = 0; y < im.height(); y++) {
            for (int x = 0; x < im.width(); x++) {
                int correct = x + y;
                if ((0 <= x && x <= 99) && (0 <= y && y <= 99)) {
                    correct = 3*correct;
                }
                if (im(x, y) != correct) {
                    printf("im(%d, %d) = %d instead of %d\n",
                           x, y, im(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        printf("....Set p to 0, expect g to be not computed\n");
        p.set(0);
        run_tracer = false;
        niters_expected = 0;
        niters = 0;
        Image<int> im = f.realize(200, 200);
        for (int y = 0; y < im.height(); y++) {
            for (int x = 0; x < im.width(); x++) {
                int correct = x + y;
                if ((0 <= x && x <= 99) && (0 <= y && y <= 99)) {
                    correct += 10 + correct;
                }
                if (im(x, y) != correct) {
                    printf("im(%d, %d) = %d instead of %d\n",
                           x, y, im(x, y), correct);
                    return -1;
                }
            }
        }
    }
    return 0;
}
コード例 #22
0
ファイル: specialize.cpp プロジェクト: Mengke-Yuan/Halide
int main(int argc, char **argv) {
    {
        Param<bool> param;

        Func f;
        Var x;
        f(x) = select(param, x*3, x*17);

        // Vectorize when the output is large enough
        Expr cond = (f.output_buffer().width() >= 4);
        f.specialize(cond).vectorize(x, 4);

        // This has created a specialization of f that is
        // vectorized. Now we want to further specialize both the
        // default case and the special case based on param. We can
        // retrieve a reference to the specialization using the same
        // condition again:
        f.specialize(cond).specialize(param);

        // Now specialize the narrow case on param as well
        f.specialize(param);

        f.set_custom_trace(&my_trace);
        f.trace_stores();

        Image<int> out(100);

        // Just check that all the specialization didn't change the output.
        param.set(true);
        reset_trace();
        f.realize(out);
        for (int i = 0; i < out.width(); i++) {
            int correct = i*3;
            if (out(i) != correct) {
                printf("out(%d) was %d instead of %d\n",
                       i, out(i), correct);
            }
        }
        param.set(false);
        f.realize(out);
        for (int i = 0; i < out.width(); i++) {
            int correct = i*17;
            if (out(i) != correct) {
                printf("out(%d) was %d instead of %d\n",
                       i, out(i), correct);
            }
        }

        // Should have used vector stores
        if (!vector_store  || scalar_store) {
            printf("This was supposed to use vector stores\n");
            return -1;
        }

        // Now try a smaller input
        out = Image<int>(3);
        param.set(true);
        reset_trace();
        f.realize(out);
        for (int i = 0; i < out.width(); i++) {
            int correct = i*3;
            if (out(i) != correct) {
                printf("out(%d) was %d instead of %d\n",
                       i, out(i), correct);
            }
        }
        param.set(false);
        f.realize(out);
        for (int i = 0; i < out.width(); i++) {
            int correct = i*17;
            if (out(i) != correct) {
                printf("out(%d) was %d instead of %d\n",
                       i, out(i), correct);
            }
        }

        // Should have used scalar stores
        if (vector_store || !scalar_store) {
            printf("This was supposed to use scalar stores\n");
            return -1;
        }

    }

    {
        Func f1, f2, g1, g2;
        Var x;

        // Define pipeline A
        f1(x) = x + 7;
        g1(x) = f1(x) + f1(x + 1);

        // Define pipeline B
        f2(x) = x * 34;
        g2(x) = f2(x) + f2(x - 1);

        // Switch between them based on a boolean param
        Param<bool> param;
        Func out;
        out(x) = select(param, g1(x), g2(x));

        // These will be outside the condition that specializes out,
        // but skip stages will nuke their allocation and computation
        // for us.
        f1.compute_root();
        g1.compute_root();
        f2.compute_root();

        out.specialize(param);

        // Count allocations.
        out.set_custom_allocator(&my_malloc, &my_free);

        reset_alloc_counts();
        param.set(true);
        out.realize(100);

        if (empty_allocs != 1 || nonempty_allocs != 2 || frees != 3) {
            printf("There were supposed to be 1 empty alloc, 2 nonempty allocs, and 3 frees.\n"
                   "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n",
                   empty_allocs, nonempty_allocs, frees);
            return -1;
        }

        reset_alloc_counts();
        param.set(false);
        out.realize(100);

        if (empty_allocs != 2 || nonempty_allocs != 1 || frees != 3) {
            printf("There were supposed to be 2 empty allocs, 1 nonempty alloc, and 3 frees.\n"
                   "Instead we got %d empty allocs, %d nonempty allocs, and %d frees.\n",
                   empty_allocs, nonempty_allocs, frees);
            return -1;
        }
    }

    {
        // Specialize for interleaved vs planar inputs
        ImageParam im(Float(32), 1);
        im.set_stride(0, Expr()); // unconstrain the stride

        Func f;
        Var x;

        f(x) = im(x);

        // If we have a stride of 1 it's worth vectorizing, but only if the width is also > 8.
        f.specialize(im.stride(0) == 1 && im.width() >= 8).vectorize(x, 8);

        f.trace_stores();
        f.set_custom_trace(&my_trace);

        // Check bounds inference is still cool with widths < 8
        f.infer_input_bounds(5);
        int m = im.get().min(0), e = im.get().extent(0);
        if (m != 0 || e != 5) {
            printf("min, extent = %d, %d instead of 0, 5\n", m, e);
            return -1;
        }

        // Check we don't crash with the small input, and that it uses scalar stores
        reset_trace();
        f.realize(5);
        if (!scalar_store || vector_store) {
            printf("These stores were supposed to be scalar.\n");
            return -1;
        }

        // Check we don't crash with a larger input, and that it uses vector stores
        Image<float> image(100);
        im.set(image);

        reset_trace();
        f.realize(100);
        if (scalar_store || !vector_store) {
            printf("These stores were supposed to be vector.\n");
            return -1;
        }

    }

    {
        // Bounds required of the input change depending on the param
        ImageParam im(Float(32), 1);
        Param<bool> param;

        Func f;
        Var x;
        f(x) = select(param, im(x + 10), im(x - 10));
        f.specialize(param);

        param.set(true);
        f.infer_input_bounds(100);
        int m = im.get().min(0);
        if (m != 10) {
            printf("min %d instead of 10\n", m);
            return -1;
        }
        param.set(false);
        im.set(Buffer());
        f.infer_input_bounds(100);
        m = im.get().min(0);
        if (m != -10) {
            printf("min %d instead of -10\n", m);
            return -1;
        }

    }

    {
        // Specialize an update definition
        Func f;
        Var x;
        Param<int> start, size;
        RDom r(start, size);


        f(x) = x;
        f(r) = 10 - r;

        // Special-case for when we only update one element of f
        f.update().specialize(size == 1);

        // Also special-case updating no elements of f
        f.update().specialize(size == 0);

        start.set(0);
        size.set(1);

        // Not crashing is enough
        f.realize(100);
    }

    {
        // What happens to bounds inference if an input is not used at
        // all for a given specialization?
        ImageParam im(Float(32), 1);
        Param<bool> param;
        Func f;
        Var x;

        f(x) = select(param, im(x), 0.0f);

        f.specialize(param);

        param.set(false);
        Image<float> image(10);
        im.set(image);
        // The image is too small, but that should be OK, because the
        // param is false so the image will never be used.
        f.realize(100);

    }

    {
        // Specialization inherits the scheduling directives done so far:

        ImageParam im(Int(32), 2);
        Func f;
        Var x, y;
        f(x, y) = im(x, y);

        Expr cond = f.output_buffer().width() >= 4;

        // Unroll y by two innermost.
        f.reorder(y, x).unroll(y, 2).reorder(x, y);

        // Vectorize if the output is at least 4-wide. Inherits the
        // unrolling already done.
        f.specialize(cond).vectorize(x, 4);

        // Confirm that the unrolling applies to both cases using bounds inference:
        f.infer_input_bounds(3, 1);

        if (im.get().extent(0) != 3) {
            printf("extent(0) was supposed to be 3.\n");
            return -1;
        }

        if (im.get().extent(1) != 2) {
            // Height is 2, because the unrolling also happens in the
            // specialized case.
            printf("extent(1) was supposed to be 2.\n");
            return -1;
        }

    }

    {
        // Check we don't need to specialize intermediate stages.
        ImageParam im(Int(32), 1);
        Func f, g, h, out;
        Var x;
        f(x) = im(x);
        g(x) = f(x);
        h(x) = g(x);
        out(x) = h(x);

        Expr w = out.output_buffer().extent(0);
        out.output_buffer().set_min(0, 0);

        f.compute_root().specialize(w >= 4).vectorize(x, 4);
        g.compute_root().vectorize(x, 4);
        h.compute_root().vectorize(x, 4);
        out.specialize(w >= 4).vectorize(x, 4);

        Image<int> input(3), output(3);
        // Shouldn't throw a bounds error:
        im.set(input);
        out.realize(output);
    }

    {
        // Check specializations of stages nested in other stages simplify appropriately.
        ImageParam im(Int(32), 2);
        Param<bool> cond1, cond2;
        Func f, out;
        Var x, y;
        f(x, y) = im(x, y);
        out(x, y) = f(x, y);

        f.compute_at(out, x).specialize(cond1 && cond2).vectorize(x, 4);
        out.compute_root().specialize(cond1 && cond2).vectorize(x, 4);

        if_then_else_count = 0;
        CountIfThenElse pass1;
        for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) {
            pass1.mutate(ff.body);
        }

        Image<int> input(3, 3), output(3, 3);
        // Shouldn't throw a bounds error:
        im.set(input);
        out.realize(output);

        if (if_then_else_count != 1) {
            printf("Expected 1 IfThenElse stmts. Found %d.\n", if_then_else_count);
            return -1;
        }
    }

    {
        // Check specializations of stages nested in other stages simplify appropriately.
        ImageParam im(Int(32), 2);
        Param<bool> cond1, cond2;
        Func f, out;
        Var x, y;
        f(x, y) = im(x, y);
        out(x, y) = f(x, y);

        f.compute_at(out, x).specialize(cond1).vectorize(x, 4);
        out.compute_root().specialize(cond1 && cond2).vectorize(x, 4);

        if_then_else_count = 0;
        CountIfThenElse pass2;
        for (auto ff : out.compile_to_module(out.infer_arguments()).functions()) {
            pass2.mutate(ff.body);
        }

        Image<int> input(3, 3), output(3, 3);
        // Shouldn't throw a bounds error:
        im.set(input);
        out.realize(output);

        // There should have been 2 Ifs total: They are the
        // outer cond1 && cond2, and the condition in the true case
        // should have been simplified away. The If in the false
        // branch cannot be simplified.
        if (if_then_else_count != 2) {
            printf("Expected 2 IfThenElse stmts. Found %d.\n", if_then_else_count);
            return -1;
        }
    }

    printf("Success!\n");
    return 0;

}
コード例 #23
0
ファイル: print.cpp プロジェクト: ronen/Halide
int main(int argc, char **argv) {
    if (get_jit_target_from_environment().has_feature(Target::Profile)) {
        // The profiler adds lots of extra prints, so counting the
        // number of prints is not useful.
        printf("Skipping test because profiler is active\n");
        return 0;
    }

    Var x;

    {
        Func f;

        f(x) = print(x * x, "the answer is", 42.0f, "unsigned", cast<uint32_t>(145));
        f.set_custom_print(halide_print);
        Buffer<int32_t> result = f.realize(10);

        for (int32_t i = 0; i < 10; i++) {
            if (result(i) != i * i) {
                return -1;
            }
        }

        assert(messages.size() == 10);
        for (size_t i = 0; i < messages.size(); i++) {
            long square;
            float forty_two;
            unsigned long one_forty_five;

            int scan_count = sscanf(messages[i].c_str(), "%ld the answer is %f unsigned %lu",
                                    &square, &forty_two, &one_forty_five);
            assert(scan_count == 3);
            assert(square == static_cast<long long>(i * i));
            assert(forty_two == 42.0f);
            assert(one_forty_five == 145);
        }
    }

    messages.clear();

    {
        Func f;
        Param<int> param;
        param.set(127);

        // Test a string containing a printf format specifier (It should print it as-is).
        f(x) = print_when(x == 3, x * x, "g", 42.0f, "%s", param);
        f.set_custom_print(halide_print);
        Buffer<int32_t> result = f.realize(10);

        for (int32_t i = 0; i < 10; i++) {
            if (result(i) != i * i) {
                return -1;
            }
        }

        assert(messages.size() == 1);
        long nine;
        float forty_two;
        long p;

        int scan_count = sscanf(messages[0].c_str(), "%ld g %f %%s %ld",
                                &nine, &forty_two, &p);
        assert(scan_count == 3);
        assert(nine == 9);
        assert(forty_two == 42.0f);
        assert(p == 127);

    }

    messages.clear();

    {
        Func f;

        // Test a single message longer than 8K.
        std::vector<Expr> args;
        for (int i = 0; i < 500; i++) {
            uint64_t n = i;
            n *= n;
            n *= n;
            n *= n;
            n *= n;
            n += 100;
            int32_t hi = n >> 32;
            int32_t lo = n & 0xffffffff;
            args.push_back((cast<uint64_t>(hi) << 32) | lo);
            Expr dn = cast<double>((float)(n));
            args.push_back(dn);
        }
        f(x) = print(args);
        f.set_custom_print(halide_print);
        Buffer<uint64_t> result = f.realize(1);

        if (result(0) != 100) {
            return -1;
        }

        assert(messages.back().size() == 8191);
    }

    messages.clear();

    // Check that Halide's stringification of floats and doubles
    // matches %f and %e respectively.

    #ifndef _WIN32
    // msvc's library has different ideas about how %f and %e should come out.
    {
        Func f, g;

        const int N = 1000000;

        Expr e = reinterpret(Float(32), random_uint());
        // Make sure we cover some special values.
        e = select(x == 0, 0.0f,
                   x == 1, -0.0f,
                   x == 2, std::numeric_limits<float>::infinity(),
                   x == 3, -std::numeric_limits<float>::infinity(),
                   x == 4, std::numeric_limits<float>::quiet_NaN(),
                   x == 5, -std::numeric_limits<float>::quiet_NaN(),
                   e);
        e = select(x == 5, std::numeric_limits<float>::denorm_min(),
                   x == 6, -std::numeric_limits<float>::denorm_min(),
                   x == 7, std::numeric_limits<float>::min(),
                   x == 8, -std::numeric_limits<float>::min(),
                   x == 9, std::numeric_limits<float>::max(),
                   x == 10, -std::numeric_limits<float>::max(),
                   x == 11, 1.0f - 1.0f / (1 << 22),
                   e);

        f(x) = print(e);

        f.set_custom_print(halide_print);
        Buffer<float> imf = f.realize(N);

        assert(messages.size() == (size_t)N);

        char correct[1024];
        for (int i = 0; i < N; i++) {
            snprintf(correct, sizeof(correct), "%f\n", imf(i));
            // OS X prints -nan as nan
            #ifdef __APPLE__
            if (messages[i] == "-nan\n") messages[i] = "nan\n";
            #endif
            if (messages[i] != correct) {
                printf("float %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, imf(i));
                return -1;
            }
        }

        messages.clear();

        g(x) = print(reinterpret(Float(64), (cast<uint64_t>(random_uint()) << 32) | random_uint()));
        g.set_custom_print(halide_print);
        Buffer<double> img = g.realize(N);

        assert(messages.size() == (size_t)N);

        for (int i = 0; i < N; i++) {
            snprintf(correct, sizeof(correct), "%e\n", img(i));
            #ifdef __APPLE__
            if (messages[i] == "-nan\n") messages[i] = "nan\n";
            #endif
            if (messages[i] != correct) {
                printf("double %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, img(i));
                return -1;
            }
        }


    }
    #endif


    printf("Success!\n");
    return 0;
}
コード例 #24
0
ファイル: inverse.cpp プロジェクト: adityaatluri/Halide
int main(int argc, char **argv) {

    Func f1, f2, f3, f4, f5;
    Func g1, g2, g3, g4, g5;

    Var x, xi;
    Expr v = x*1.34f + 1.0142f;

    Param<float> p;
    p.set(1.0f);

    // Test accuracy of reciprocals.

    // First prevent any optimizations by hiding 1.0 in a param.
    f1(x) = p / v;

    // Now test various vectorization widths with an explicit 1.0. On
    // arm 2 and 4 trigger optimizations. On x86 4 and 8 do.
    f2(x) = fast_inverse(v);
    f2.vectorize(x, 2);

    f3(x) = fast_inverse(v);
    f3.vectorize(x, 4);

    f4(x) = fast_inverse(v);
    f4.vectorize(x, 8);

    // Same thing for reciprocal square root.
    g1(x) = p / sqrt(v);

    g2(x) = fast_inverse_sqrt(v);
    g2.vectorize(x, 2);

    g3(x) = fast_inverse_sqrt(v);
    g3.vectorize(x, 4);

    g4(x) = fast_inverse_sqrt(v);
    g4.vectorize(x, 8);

    // Also test both on the GPU.
    f5(x) = fast_inverse(v);
    g5(x) = fast_inverse_sqrt(v);

    Target t = get_jit_target_from_environment();
    if (t.has_gpu_feature()) {
        f5.gpu_tile(x, xi, 16);
        g5.gpu_tile(x, xi, 16);
    }

    Buffer<float> imf1 = f1.realize(10000);
    Buffer<float> imf2 = f2.realize(10000);
    Buffer<float> imf3 = f3.realize(10000);
    Buffer<float> imf4 = f4.realize(10000);
    Buffer<float> imf5 = f5.realize(10000);

    Buffer<float> img1 = g1.realize(10000);
    Buffer<float> img2 = g2.realize(10000);
    Buffer<float> img3 = g3.realize(10000);
    Buffer<float> img4 = g4.realize(10000);
    Buffer<float> img5 = g5.realize(10000);

    printf("Testing accuracy of inverse\n");
    check(imf1, imf2);
    check(imf1, imf3);
    check(imf1, imf4);
    check(imf1, imf5);
    printf("Pass.\n");
    printf("Testing accuracy of inverse sqrt\n");
    check(img1, img2);
    check(img1, img3);
    check(img1, img4);
    check(img1, img5);
    printf("Pass.\n");

    printf("Success!\n");
    return 0;
}
コード例 #25
0
ファイル: inverse.cpp プロジェクト: Iamquen/Halide
int main(int argc, char **argv) {

    Func f1, f2, f3, f4;
    Func g1, g2, g3, g4;

    Var x;
    Expr v = x*1.34f + 1.0142f;

    Param<float> p;
    p.set(1.0f);

    // Test accuracy of reciprocals.

    // First prevent any optimizations by hiding 1.0 in a param.
    f1(x) = p / v;

    // Now test various vectorization widths with an explicit 1.0. On
    // arm 2 and 4 trigger optimizations. On x86 4 and 8 do.
    f2(x) = 1.0f / v;
    f2.vectorize(x, 2);

    f3(x) = 1.0f / v;
    f3.vectorize(x, 4);

    f4(x) = 1.0f / v;
    f4.vectorize(x, 8);

    // Same thing for reciprocal square root.
    g1(x) = p / sqrt(v);

    g2(x) = 1.0f / sqrt(v);
    g2.vectorize(x, 2);

    g3(x) = 1.0f / sqrt(v);
    g3.vectorize(x, 4);

    g4(x) = 1.0f / sqrt(v);
    g4.vectorize(x, 8);

    Image<float> imf1 = f1.realize(10000);
    Image<float> imf2 = f2.realize(10000);
    Image<float> imf3 = f3.realize(10000);
    Image<float> imf4 = f4.realize(10000);

    Image<float> img1 = g1.realize(10000);
    Image<float> img2 = g2.realize(10000);
    Image<float> img3 = g3.realize(10000);
    Image<float> img4 = g4.realize(10000);

    printf("Testing accuracy of inverse\n");
    check(imf1, imf2);
    check(imf1, imf3);
    check(imf1, imf4);
    printf("Pass.\n");
    printf("Testing accuracy of inverse sqrt\n");
    check(img1, img2);
    check(img1, img3);
    check(img1, img4);
    printf("Pass.\n");

    printf("Success!\n");
    return 0;
}
コード例 #26
0
ファイル: print.cpp プロジェクト: white-pony/Halide
// Tests Halide's print() and print_when() by routing their output through
// the custom handler halide_print and then inspecting the file-level
// `messages` container after each realization. (halide_print is defined
// outside this block; presumably it appends every printed line to
// `messages` -- confirm against its definition.)
//
// Returns 0 on success or when the test is skipped, and -1 when a computed
// value or a formatted string does not match. Failed assert()s abort.
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (target.has_feature(Target::Profile)) {
        // The profiler adds lots of extra prints, so counting the
        // number of prints is not useful.
        printf("Skipping test because profiler is active\n");
        return 0;
    }

    if (target.has_feature(Target::Debug)) {
        // Same thing here: the runtime debug adds lots of extra prints,
        // so counting the number of prints is not useful.
        printf("Skipping test because runtime debug is active\n");
        return 0;
    }

    Var x;

    // Case 1: a print() mixing integer, string, float and unsigned arguments.
    {
        Func f;

        f(x) = print(x * x, "the answer is", 42.0f, "unsigned", cast<uint32_t>(145));
        f.set_custom_print(halide_print);
        Buffer<int32_t> result = f.realize(10);

        // The Func's value must still be the first print() argument.
        for (int32_t i = 0; i < 10; i++) {
            if (result(i) != i * i) {
                return -1;
            }
        }

        // One message per realized point, each parseable back into the
        // three numeric arguments.
        assert(messages.size() == 10);
        for (size_t i = 0; i < messages.size(); i++) {
            long square;
            float forty_two;
            unsigned long one_forty_five;

            int scan_count = sscanf(messages[i].c_str(), "%ld the answer is %f unsigned %lu",
                                    &square, &forty_two, &one_forty_five);
            assert(scan_count == 3);
            assert(square == static_cast<long long>(i * i));
            assert(forty_two == 42.0f);
            assert(one_forty_five == 145);
        }
    }

    messages.clear();

    // Case 2: print_when() fires only where its condition holds (x == 3).
    {
        Func f;
        Param<int> param;
        param.set(127);

        // Test a string containing a printf format specifier (It should print it as-is).
        f(x) = print_when(x == 3, x * x, "g", 42.0f, "%s", param);
        f.set_custom_print(halide_print);
        Buffer<int32_t> result = f.realize(10);

        for (int32_t i = 0; i < 10; i++) {
            if (result(i) != i * i) {
                return -1;
            }
        }

        assert(messages.size() == 1);
        long nine;
        float forty_two;
        long p;

        // "%%s" in the scan format matches the literal "%s" in the message.
        int scan_count = sscanf(messages[0].c_str(), "%ld g %f %%s %ld",
                                &nine, &forty_two, &p);
        assert(scan_count == 3);
        assert(nine == 9);
        assert(forty_two == 42.0f);
        assert(p == 127);

    }

    messages.clear();

    // Case 3: a single print() with 1000 arguments.
    {
        Func f;

        // Test a single message longer than 8K.
        std::vector<Expr> args;
        for (int i = 0; i < 500; i++) {
            // n = i^16 + 100 (mod 2^64), built by repeated squaring.
            uint64_t n = i;
            n *= n;
            n *= n;
            n *= n;
            n *= n;
            n += 100;
            // Split into two 32-bit halves and reassemble as a uint64 Expr.
            int32_t hi = n >> 32;
            int32_t lo = n & 0xffffffff;
            args.push_back((cast<uint64_t>(hi) << 32) | lo);
            Expr dn = cast<double>((float)(n));
            args.push_back(dn);
        }
        f(x) = print(args);
        f.set_custom_print(halide_print);
        Buffer<uint64_t> result = f.realize(1);

        // The first argument corresponds to i == 0, i.e. n == 100.
        if (result(0) != 100) {
            return -1;
        }

        // The captured message is expected to be exactly 8191 characters
        // (presumably the runtime truncates at its 8K buffer -- confirm).
        assert(messages.back().size() == 8191);
    }

    messages.clear();

    // Check that Halide's stringification of floats and doubles
    // matches %f and %e respectively.

    #ifndef _WIN32
    // msvc's library has different ideas about how %f and %e should come out.
    {
        Func f, g;

        const int N = 1000000;

        // Random bit patterns reinterpreted as floats.
        Expr e = reinterpret(Float(32), random_uint());
        // Make sure we cover some special values.
        e = select(x == 0, 0.0f,
                   x == 1, -0.0f,
                   x == 2, std::numeric_limits<float>::infinity(),
                   x == 3, -std::numeric_limits<float>::infinity(),
                   x == 4, std::numeric_limits<float>::quiet_NaN(),
                   x == 5, -std::numeric_limits<float>::quiet_NaN(),
                   e);
        // Indices continue at 6 so that the -NaN arm above (x == 5) is
        // not overridden by this second select.
        e = select(x == 6, std::numeric_limits<float>::denorm_min(),
                   x == 7, -std::numeric_limits<float>::denorm_min(),
                   x == 8, std::numeric_limits<float>::min(),
                   x == 9, -std::numeric_limits<float>::min(),
                   x == 10, std::numeric_limits<float>::max(),
                   x == 11, -std::numeric_limits<float>::max(),
                   x == 12, 1.0f - 1.0f / (1 << 22),
                   e);

        f(x) = print(e);

        f.set_custom_print(halide_print);
        Buffer<float> imf = f.realize(N);

        assert(messages.size() == (size_t)N);

        // Compare every captured float message with the C library's "%f".
        char correct[1024];
        for (int i = 0; i < N; i++) {
            snprintf(correct, sizeof(correct), "%f\n", imf(i));
            // Some versions of the std library can emit some NaN patterns
            // as "-nan", due to sloppy conversion (or not) of the sign bit.
            // Halide considers all NaN's equivalent, so paper over this
            // noise in the test by normalizing all -nan -> nan.
            if (messages[i] == "-nan\n") messages[i] = "nan\n";
            if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n");
            if (messages[i] != correct) {
                printf("float %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, imf(i));
                return -1;
            }
        }

        messages.clear();

        // Doubles: 64 random bits assembled from two random_uint() calls.
        g(x) = print(reinterpret(Float(64), (cast<uint64_t>(random_uint()) << 32) | random_uint()));
        g.set_custom_print(halide_print);
        Buffer<double> img = g.realize(N);

        assert(messages.size() == (size_t)N);

        // Compare every captured double message with the C library's "%e".
        for (int i = 0; i < N; i++) {
            snprintf(correct, sizeof(correct), "%e\n", img(i));
            // Same -nan -> nan normalization as for the floats above.
            if (messages[i] == "-nan\n") messages[i] = "nan\n";
            if (!strcmp(correct, "-nan\n")) strcpy(correct, "nan\n");
            if (messages[i] != correct) {
                printf("double %d: %s vs %s for %10.20e\n", i, messages[i].c_str(), correct, img(i));
                return -1;
            }
        }


    }
    #endif

    messages.clear();

    // True when the target has either HVX feature; the vectorized tests
    // below are then scheduled with hexagon().
    const bool use_hvx = target.features_any_of({Target::HVX_64, Target::HVX_128});

    {
        Func f;

        // Test a vectorized print.
        f(x) = print(x * 3);
        f.set_custom_print(halide_print);
        f.vectorize(x, 32);
        if (use_hvx) {
            f.hexagon();
        }
        Buffer<int> result = f.realize(128);

        // The Hexagon simulator prints directly to stderr, so we
        // can only read the messages on other targets.
        if (!use_hvx) {
            // One message per element, in order of x.
            assert((int)messages.size() == result.width());
            for (size_t i = 0; i < messages.size(); i++) {
                assert(messages[i] == std::to_string(i * 3) + "\n");
            }
        }
    }

    messages.clear();

    {
        Func f;

        // Test a vectorized print_when.
        f(x) = print_when(x % 2 == 0, x * 3);
        f.set_custom_print(halide_print);
        f.vectorize(x, 32);
        if (use_hvx) {
            f.hexagon();
        }
        Buffer<int> result = f.realize(128);

        // The Hexagon simulator prints directly to stderr, so we
        // can only read the messages on other targets.
        if (!use_hvx) {
            // Only even x print, so message i corresponds to x == 2 * i.
            assert((int)messages.size() == result.width() / 2);
            for (size_t i = 0; i < messages.size(); i++) {
                assert(messages[i] == std::to_string(i * 2 * 3) + "\n");
            }
        }
    }


    printf("Success!\n");
    return 0;
}
コード例 #27
0
ファイル: likely.cpp プロジェクト: white-pony/Halide
// Walks through the situations in which Halide partitions a loop into a
// prologue, steady state and epilogue, and asserts the expected number of
// partitions (and, in two cases, of sin calls) via the helpers
// count_partitions() and count_sin_calls(), which are defined elsewhere in
// this file. Prints "Success!" and returns 0 when it reaches the end.
int main(int argc, char **argv) {
    // Shared producer used by most of the cases below.
    Func f;
    Var x;
    f(x) = x;
    f.compute_root();

    // Halide will partition a loop into three pieces in a few
    // situations. The pieces are 1) a messy prologue, 2) a clean
    // steady state, and 3) a messy epilogue. One way to trigger this
    // is if you use a boundary condition helper:

    {
        Func g = BoundaryConditions::repeat_edge(f, 0, 100);
        count_partitions(g, 3);
    }

    // If you vectorize or otherwise split, then the last vector
    // (which gets shifted leftwards) is its own partition. This
    // removes some clamping logic from the inner loop.

    {
        Func g;
        g(x) = f(x);
        g.vectorize(x, 8);
        count_partitions(g, 2);
    }

    // The slicing applies to every loop level starting from the
    // outermost one, but only recursively simplifies the clean steady
    // state. Each level is split three ways (start, middle, end), so
    // adding a boundary condition to a 2D computation will produce 5
    // code paths for the top, bottom, left, right, and center of the
    // image.
    {
        Var y;
        Func g;
        g(x, y) = x + y;
        g.compute_root();
        Func h = BoundaryConditions::mirror_image(g, 0, 10, 0, 10);
        count_partitions(h, 5);
    }

    // If you split and also have a boundary condition, or have
    // multiple boundary conditions at play (e.g. because you're
    // blurring an inlined Func that uses a boundary condition), then
    // there are still only three partitions. The steady state is the
    // slice of the loop where *all* of the boundary conditions and
    // splitting logic simplify away.
    {
        Func g = BoundaryConditions::mirror_interior(f, 0, 10);
        Func h;
        h(x) = g(x-1) + g(x+1);
        h.vectorize(x, 8);
        count_partitions(h, 3);
    }

    // You can manually control the splitting behavior using the
    // 'likely' intrinsic. When used on one side of a select, min,
    // max, or clamp, it tags the select, min, max, or clamp as likely
    // to simplify to that expression in the steady state case, and
    // tries to solve for loop variable values for which this is true.
    {
        // So this code should produce a prologue that evaluates to sin(x), and
        // a steady state that evaluates to 1:
        Func g;
        g(x) = select(x < 10, sin(x), likely(1.0f));
        // There should be two partitions
        count_partitions(g, 2);
        // But only one should call sin
        count_sin_calls(g, 1);
    }

    {
        // This code should produce a prologue and epilogue that
        // evaluate sin(x), and a steady state that evaluates to 1:
        Func g;
        g(x) = select(x < 10 || x > 100, sin(x), likely(1.0f));
        // There should be three partitions
        count_partitions(g, 3);
        // With calls to sin in the prologue and epilogue.
        count_sin_calls(g, 2);
    }

    // As a special case, we treat clamped ramps as likely to
    // simplify to the clamped expression. This handles the many
    // existing cases where people have written their boundary
    // condition manually using clamp.
    {
        Func g;
        g(x) = f(clamp(x, 0, 10)); // treated as clamp(likely(x), 0, 10)
        g.vectorize(x, 8);
        count_partitions(g, 3);
    }

    // Using the likely intrinsic pulls some IR relating to the
    // condition outside of the loop. We'd better check that this
    // respects lets and doesn't do any combinatorial expansion. We'll
    // do this with a nasty comparison:
    {
        Func g;
        Var y;

        // Have an inner reduction loop that the comparisons depend on
        // to make things harder.
        RDom r(0, 5);

        const int N = 25;

        // Make some nasty expressions to compare to. Each one references
        // the previous one twice, so a naive expansion would double in
        // size at every step.
        Expr e[N];
        e[0] = y;
        for (int i = 1; i < N; i++) {
            e[i] = e[i-1] * e[i-1] + y + r;
        }
        // Make a nasty condition that uses all of these.
        Expr nasty = cast<bool>(1);
        for (int i = 0; i < N; i++) {
            nasty = nasty && (x*(i+1) < e[i]);
        }

        // Have an innermost loop over c to complicate things further.
        Var c;
        g(c, x, y) = sum(select(nasty, likely(10), c + r));

        // Check that it doesn't take the age of the world to compile,
        // and that it produces the right number of partitions.
        count_partitions(g, 3);
    }

    // Make sure partitions that occur outside of the actual bounds
    // don't mess things up.
    {
        Func g;
        // Note: this Var shadows the outer x for the rest of this scope.
        Var x;
        Param<int> limit;
        g(x) = select(x > limit, likely(3), 2);

        // If either of these realize calls iterates from 0 to limit,
        // and then from limit to 10, we'll have a nice segfault.
        limit.set(10000000);
        Buffer<int> result = g.realize(10);

        limit.set(-10000000);
        result = g.realize(10);
    }

    // The performance of this behavior is tested in
    // test/performance/boundary_conditions.cpp

    printf("Success!\n");
    return 0;
}