C++ (Cpp) Func::compile_jitの例、Func::compile_jit, AlgoSolution C++ (Cpp)の例

コード例 #1

0

ファイルを表示

ファイル: bad_elem_size.cpp プロジェクト: 202198/Halide

// Error test: runs a JIT-compiled pipeline on a hand-built buffer_t whose
// elem_size is deliberately wrong and expects the installed error handler to
// record a runtime error. Returns 0 when the error was seen, -1 otherwise.
int main(int argc, char **argv) {
    Var x, y;
    Func f;
    f(x, y) = x+y;

    // Dig out the raw function pointer so we can use it as if we were
    // compiling statically
    void (*function)(buffer_t *) = (void (*)(buffer_t *))(f.compile_jit());

    buffer_t out;
    memset(&out, 0, sizeof(out));
    out.host = (uint8_t *)malloc(10*10);
    out.elem_size = 1; // should be 4!
    out.extent[0] = 10;
    out.extent[1] = 10;
    out.stride[0] = 1;
    out.stride[1] = 10;

    // halide_error and error_occurred are declared elsewhere in this file;
    // presumably the handler sets the flag -- confirm against its definition.
    f.set_error_handler(&halide_error);
    error_occurred = false;
    function(&out);

    // The pipeline call is finished with the host allocation; release it so
    // the test does not leak on either exit path.
    free(out.host);
    out.host = NULL;

    if (error_occurred) {
        printf("Success!\n");
        return 0;
    } else {
        printf("There should have been a runtime error\n");
        return -1;
    }
}

コード例 #2

0

ファイルを表示

ファイル: lesson_12_using_the_gpu.cpp プロジェクト: Mengke-Yuan/Halide

    // Now we define methods that give our pipeline several different
    // schedules.
    // Applies a CPU schedule to the pipeline stages (lut, sharpen, padded,
    // curved) and JIT-compiles the result.
    void schedule_for_cpu() {
        // The look-up table is evaluated once, ahead of everything else.
        lut.compute_root();

        // Make the color channel the innermost loop, promise that it
        // covers exactly three values, and unroll over it.
        curved.reorder(c, x, y);
        curved.bound(c, 0, 3);
        curved.unroll(c);

        // Table look-ups vectorize poorly, so curved is instead cut
        // into strips of 16 scanlines that run in parallel.
        Var yo, yi;
        curved.split(y, yo, yi, 16);
        curved.parallel(yo);

        // sharpen is produced on demand for each scanline of curved
        // and, being 16-bit, is vectorized 8-wide.
        sharpen.compute_at(curved, yi);
        sharpen.vectorize(x, 8);

        // The padded input is also produced per scanline of curved,
        // but its storage lives at the strip level so values computed
        // earlier in the same 16-scanline strip can be reused. It is
        // 8-bit, hence the 16-wide vectorization.
        padded.store_at(curved, yo);
        padded.compute_at(curved, yi);
        padded.vectorize(x, 16);

        // Finally JIT-compile the whole pipeline for the CPU.
        curved.compile_jit();
    }

コード例 #3

0

ファイルを表示

ファイル: clamped_vector_load.cpp プロジェクト: hirokai/Halide

// Compiles `f` to an assembly file and to JIT code, realizes it into the
// file-level `output`, optionally checks every pixel against a reference
// computed from the file-level `input`, and returns the currentTime()
// difference measured across ten further realizations.
//
// On the first mismatching pixel the function prints a diagnostic and exits
// the process with status -1.
double test(Func f, bool test_correctness = true) {
    f.compile_to_assembly(f.name() + ".s", Internal::vec<Argument>(input), f.name());
    f.compile_jit();
    f.realize(output);

    if (test_correctness) {
        for (int row = 0; row < output.height(); row++) {
            for (int col = 0; col < output.width(); col++) {
                // Reference result: both taps are clamped to [MIN, MAX]
                // before reading the input.
                const int left = std::max(std::min(col, MAX), MIN);
                const int right = std::max(std::min(col+1, MAX), MIN);
                const uint16_t expected = input(left, row) * 3 + input(right, row);
                if (output(col, row) == expected) {
                    continue;
                }
                printf("output(%d, %d) = %d instead of %d\n",
                       col, row, output(col, row), expected);
                exit(-1);
            }
        }
    }

    // Time ten back-to-back realizations.
    const double start = currentTime();
    int remaining = 10;
    while (remaining-- > 0) {
        f.realize(output);
    }
    return currentTime() - start;
}

コード例 #4

0

ファイルを表示

ファイル: realize_over_shifted_domain.cpp プロジェクト: adityaatluri/Halide

// Realizes a Func over an output buffer whose minimum coordinate is not the
// origin, reading from an input buffer that is shifted as well, and checks
// two corner values. Returns 0 on success and -1 on mismatch.
int main(int argc, char **argv) {
    // A 100x50 image moved so that it covers [100, 199] x [50, 99].
    Buffer<int> input(100, 50);
    input.set_min(100, 50);

    // Mark two positions that the pipeline below will sample.
    input(100, 50) = 123;
    input(198, 99) = 234;

    Func f;
    Var x, y;
    f(x, y) = input(2*x, y/2);

    f.compile_jit();

    // A 50x100 output moved so that it covers [50, 99] x [100, 199].
    Buffer<int> result(50, 100);
    result.set_min(50, 100);

    f.realize(result);

    const int first_corner = result(50, 100);
    const int last_corner = result(99, 199);

    if (first_corner == 123 && last_corner == 234) {
        printf("Success!\n");
        return 0;
    }

    fprintf(stderr, "Err: f(50, 100) = %d (supposed to be 123)\n"
            "f(99, 199) = %d (supposed to be 234)\n",
            first_corner, last_corner);
    return -1;
}

コード例 #5

0

ファイルを表示

ファイル: out_constraint.cpp プロジェクト: jiapei100/Halide

int main(int argc, char **argv) {
    ImageParam input(UInt(8), 1);
    input.dim(0).set_bounds(0, size);

    {
        Func f;
        Var x;
        f(x) = input(x);
        // Output must have the same size as the input.
        f.output_buffer().dim(0).set_bounds(input.dim(0).min(), input.dim(0).extent());
        f.add_custom_lowering_pass(new Validator);
        f.compile_jit();

        Buffer<uint8_t> dummy(size);
        dummy.fill(42);
        input.set(dummy);
        Buffer<uint8_t> out = f.realize(size);
        if (!out.all_equal(42)) {
            std::cerr << "wrong output" << std::endl;
            exit(-1);
        }
    }

    {
        Func f;
        Var x;
        f(x) = undef(UInt(8));
        RDom r(input);
        f(r.x) = cast<uint8_t>(42);

        f.add_custom_lowering_pass(new Validator);
        f.compile_jit();

        Buffer<uint8_t> dummy(size);
        input.set(dummy);
        Buffer<uint8_t> out = f.realize(size);
        if (!out.all_equal(42)) {
            std::cerr << "wrong output" << std::endl;
            exit(-1);
        }
    }

    std::cout << "Success!" << std::endl;

    return 0;

}

コード例 #6

0

ファイルを表示

ファイル: memcpy.cpp プロジェクト: delcypher/Halide

// Times a Halide pipeline that copies a 1-D uint8 buffer against the C
// library memcpy on the same data. Returns -1 when the accumulated Halide
// time exceeds twice the accumulated memcpy time, 0 otherwise.
int main(int argc, char **argv) {
    ImageParam src(UInt(8), 1);
    Func dst;
    Var x;
    dst(x) = src(x);

    // Schedule: split x into chunks of 8*4096 and vectorize the inner part
    // 16-wide.
    // Adding dst.parallel(xo) speeds up halide's memcpy considerably, but
    // doesn't seem sporting.
    Var xo;
    dst.split(x, xo, x, 8*4096)
       .vectorize(x, 16);

    dst.compile_to_assembly("memcpy.s", {src}, "memcpy");
    dst.compile_jit();

    const int32_t buffer_size = 12345678;
    const int iterations = 50;

    Image<uint8_t> input(buffer_size);
    Image<uint8_t> output(buffer_size);

    src.set(input);

    // One warm-up run to get past one-time set-up issues for the ptx backend.
    dst.realize(output);

    // Accumulated elapsed time for each contender; every iteration performs
    // three copies per contender.
    double halide_total = 0;
    double system_total = 0;
    for (int iter = 0; iter < iterations; iter++) {
        const double halide_begin = current_time();
        dst.realize(output);
        dst.realize(output);
        dst.realize(output);
        const double halide_end = current_time();
        memcpy(output.data(), input.data(), input.width());
        memcpy(output.data(), input.data(), input.width());
        memcpy(output.data(), input.data(), input.width());
        const double system_end = current_time();
        system_total += system_end-halide_end;
        halide_total += halide_end-halide_begin;
    }

    // NOTE(review): the factor of 1000 presumably converts from the
    // milliseconds returned by current_time() -- confirm against its definition.
    printf("system memcpy: %.3e byte/s\n", (buffer_size / system_total) * 3 * 1000 * iterations);
    printf("halide memcpy: %.3e byte/s\n", (buffer_size / halide_total) * 3 * 1000 * iterations);

    // memcpy will win by a little bit for large inputs because it uses streaming stores
    if (halide_total > system_total * 2) {
        printf("Halide memcpy is slower than it should be.\n");
        return -1;
    }

    printf("Success!\n");
    return 0;
}

コード例 #7

0

ファイルを表示

ファイル: extern_consumer.cpp プロジェクト: Amos-zq/Halide

int main(int argc, char **argv) {
    // Define a pipeline that dumps some squares to a file using an
    // external consumer stage.
    Func source;
    Var x;
    source(x) = x*x;

    Param<int> min, extent;
    Param<const char *> filename;

    Func sink;
    std::vector<ExternFuncArgument> args;
    args.push_back(source);
    args.push_back(filename);
    args.push_back(min);
    args.push_back(extent);
    sink.define_extern("dump_to_file", args, Int(32), 0);

    source.compute_root();

    sink.compile_jit();

    // Dump the first 10 squares to a file
    filename.set("halide_test_extern_consumer.txt");
    min.set(0);
    extent.set(10);
    sink.realize();

    if (!check_result())
        return -1;

    // Test ImageParam ExternFuncArgument via passed in image.
    Image<int32_t> buf = source.realize(10);
    ImageParam passed_in(Int(32), 1);
    passed_in.set(buf);

    Func sink2;
    std::vector<ExternFuncArgument> args2;
    args2.push_back(passed_in);
    args2.push_back(filename);
    args2.push_back(min);
    args2.push_back(extent);
    sink2.define_extern("dump_to_file", args2, Int(32), 0);

    sink2.realize();

    if (!check_result())
        return -1;

    printf("Success!\n");
    return 0;

}

コード例 #8

0

ファイルを表示

ファイル: unbounded_output.cpp プロジェクト: AheadIO/Halide

int main(int argc, char **argv) {
    Func f;
    Var x, y;

    ImageParam in(Float(32), 2);
    ImageParam x_coord(Int(32), 2);
    ImageParam y_coord(Int(32), 2);

    f(x, y) = 0.0f;
    RDom r(0, 100, 0, 100);
    f(x_coord(r.x, r.y), y_coord(r.x, r.y)) += in(r.x, r.y);

    f.compile_jit();

    printf("I should not have reached here\n");

    return 0;
}

コード例 #9

0

ファイルを表示

ファイル: inner_loop_parallel.cpp プロジェクト: jrprice/Halide

// Performance test: runs a Func that is parallel over a 2-wide dimension
// while HL_NUM_THREADS is set to 2, 4, ..., 64, and fails if any setting is
// more than five times slower than the 2-thread baseline. Returns 0 on
// success and -1 on unacceptable overhead.
int main(int argc, char **argv) {

    Func f;
    Var x, y;
    f(x, y) = x + y;
    f.parallel(x);

    // putenv() stores the pointer it is given instead of copying the string,
    // so the buffer must stay alive for as long as the variable is in the
    // environment. A loop-local array would leave a dangling pointer behind;
    // static storage lasts for the rest of the process.
    static char thread_env[32];

    // Having more threads than tasks shouldn't hurt performance too much.
    double correct_time = 0;

    for (int t = 2; t <= 64; t *= 2) {
        snprintf(thread_env, sizeof(thread_env), "HL_NUM_THREADS=%d", t);
        putenv(thread_env);
        // Drop the shared JIT runtime so the recompiled pipeline starts from
        // a fresh one -- presumably this is what makes the new thread count
        // take effect; confirm against the runtime.
        Halide::Internal::JITSharedRuntime::release_all();
        f.compile_jit();
        // Start the thread pool without giving any hints as to the
        // number of tasks we'll be using.
        f.realize(t, 1);

        // Best of three timed runs with only two parallel tasks.
        double min_time = 1e20;
        for (int i = 0; i < 3; i++) {
            double t1 = current_time();
            f.realize(2, 1000000);
            double t2 = current_time() - t1;
            if (t2 < min_time) min_time = t2;
        }

        printf("%d: %f ms\n", t, min_time);
        if (t == 2) {
            // The first (2-thread) run is the baseline.
            correct_time = min_time;
        } else if (min_time > correct_time * 5) {
            printf("Unacceptable overhead when using %d threads for 2 tasks: %f ms vs %f ms\n",
                   t, min_time, correct_time);
            return -1;
        }
    }

    printf("Success!\n");
    return 0;
}

コード例 #10

0

ファイルを表示

ファイル: dynamic_reduction_bounds.cpp プロジェクト: netaz/Halide

int main(int argc, char **argv) {
    
    ImageParam input(Float(32), 2);

    Var x, y, z;
    RDom dom(0, input.width()*8);
    Func f;
    Expr hard_to_reason_about = cast<int>(hypot(input.width(), input.height()));
    f(x, y, z) = 1;
    f(x, y, dom / hard_to_reason_about) += 1;
    f.compile_jit();

    Image<float> im(32, 32);
    input.set(im);

    f.realize(100, 100, 16);

    printf("Success!\n");
    return 0;
}

コード例 #11

0

ファイルを表示

ファイル: lesson_12_using_the_gpu.cpp プロジェクト: kree-colemcalughlin/Halide

    // Now a schedule that uses CUDA or OpenCL.
    // Applies a GPU (CUDA or OpenCL) schedule to the pipeline stages and
    // JIT-compiles the result for a host target with OpenCL enabled.
    void schedule_for_gpu() {
        // Whether a Func runs on the GPU is decided per Func. When a
        // CPU stage feeds a GPU stage, Halide performs the
        // copy-to-gpu under the hood. Nothing in this pipeline needs
        // the CPU, so the input is copied to the GPU on the first run
        // and left there for later runs to reuse.

        // As in the CPU schedule, the LUT is computed once up front.
        lut.compute_root();

        // Evaluate the look-up table on the GPU in one-dimensional
        // thread blocks 16 wide: first split the index into blocks of
        // 16, then map 'block' and 'thread' onto CUDA's blocks and
        // threads (OpenCL's thread groups and threads).
        Var block, thread;
        lut.split(i, block, thread, 16);
        lut.gpu_blocks(block);
        lut.gpu_threads(thread);

        // This pattern is common enough on the GPU to have a
        // shorthand:
        //
        // lut.gpu_tile(i, 16);
        //
        // Func::gpu_tile behaves like Func::tile, with the tile
        // coordinates mapped to GPU blocks and the coordinates within
        // each tile mapped to GPU threads.

        // Color channels go innermost; promise there are exactly
        // three of them and unroll across them.
        curved.reorder(c, x, y);
        curved.bound(c, 0, 3);
        curved.unroll(c);

        // curved itself is computed on the GPU in 8x8 tiles, which is
        // equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);
        curved.gpu_tile(x, y, 8, 8);

        // sharpen stays inlined into curved.

        // The padded input is computed once per GPU block, with the
        // intermediate held in shared memory. Var::gpu_blocks and
        // Var::gpu_threads exist for scheduling producers inside GPU
        // blocks and threads.
        padded.compute_at(curved, Var::gpu_blocks());

        // GPU threads cover the x and y coordinates of the padded
        // input.
        padded.gpu_threads(x, y);

        // CUDA and OpenCL are not enabled by default: a Target object
        // has to be built, one of them switched on, and the target
        // handed to compile_jit. Without that, the CPU will very
        // slowly pretend to be a GPU, using one thread per output
        // pixel.

        // Begin from a target describing the machine this runs on.
        Target target = get_host_target();

        // OpenCL is enabled here because it tends to perform better
        // than CUDA even with NVidia's drivers: NVidia's open source
        // LLVM backend doesn't seem to do all the optimizations their
        // proprietary compiler does.
        target.features |= Target::OpenCL;

        // To try CUDA instead, comment out the line above and
        // uncomment this one:
        // target.features |= Target::CUDA;

        // Enabling the GPUDebug flag shows every OpenCL or CUDA API
        // call the pipeline makes, which helps find slow stages and
        // CPU -> GPU copies. It costs performance, so it stays
        // commented out.
        //target.features |= Target::GPUDebug;

        curved.compile_jit(target);
    }

コード例 #12

0

ファイルを表示

ファイル: sort.cpp プロジェクト: AheadIO/Halide

// Sorts the same random data with a Halide bitonic sort, a Halide merge sort
// (both built by helpers defined elsewhere in this file) and std::sort,
// prints the measured times, and compares both Halide results against the
// std::sort result. Returns 0 when both match and -1 on the first mismatch.
int main(int argc, char **argv) {

    const int N = 1 << 10;

    // Random input values, masked to 20 bits.
    Image<int> data(N);
    for (int i = 0; i < N; i++) {
        data(i) = rand() & 0xfffff;
    }
    Func input = lambda(x, data(x));

    printf("Bitonic sort...\n");
    Func f = bitonic_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Image<int> bitonic_sorted(N);
    // One untimed run first, then ten timed ones.
    f.realize(bitonic_sorted);
    double t1 = current_time();
    for (int i = 0; i < 10; i++) {
        f.realize(bitonic_sorted);
    }
    double t2 = current_time();

    printf("Merge sort...\n");
    f = merge_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Image<int> merge_sorted(N);
    // One untimed run first, then ten timed ones.
    f.realize(merge_sorted);
    double t3 = current_time();
    for (int i = 0; i < 10; i++) {
        f.realize(merge_sorted);
    }
    double t4 = current_time();

    // Reference result: copy the data and sort it with the standard library.
    Image<int> correct(N);
    for (int i = 0; i < N; i++) {
        correct(i) = data(i);
    }
    printf("std::sort...\n");
    double t5 = current_time();
    // Form the end pointer by arithmetic on the first element instead of
    // indexing element N, which lies one past the end of the image.
    int *correct_begin = &correct(0);
    std::sort(correct_begin, correct_begin + N);
    double t6 = current_time();

    // The Halide figures are averaged over the ten timed runs; std::sort was
    // run once.
    printf("Times:\n"
           "bitonic sort: %f \n"
           "merge sort: %f \n"
           "std::sort %f\n",
           (t2-t1)/10, (t4-t3)/10, t6-t5);

    // Dump all three results side by side only for small problem sizes.
    if (N <= 100) {
        for (int i = 0; i < N; i++) {
            printf("%8d %8d %8d\n",
                   correct(i), bitonic_sorted(i), merge_sorted(i));
        }
    }

    for (int i = 0; i < N; i++) {
        if (bitonic_sorted(i) != correct(i)) {
            printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i));
            return -1;
        }
        if (merge_sorted(i) != correct(i)) {
            printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i));
            return -1;
        }
    }

    return 0;
}

コード例 #13

0

ファイルを表示

ファイル: sort.cpp プロジェクト: ronen/Halide

// Sorts the same random data with a Halide bitonic sort, a Halide merge sort
// (both built by helpers defined elsewhere in this file) and std::sort,
// prints the benchmark() results scaled by 1e3, and compares both Halide
// results against the std::sort result. Returns 0 when both match and -1 on
// the first mismatch.
int main(int argc, char **argv) {

    const int N = 1 << 10;

    // Random input values, masked to 20 bits.
    Buffer<int> data(N);
    for (int i = 0; i < N; i++) {
        data(i) = rand() & 0xfffff;
    }
    Func input = lambda(x, data(x));

    printf("Bitonic sort...\n");
    Func f = bitonic_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Buffer<int> bitonic_sorted(N);
    // One untimed run before handing the pipeline to benchmark().
    f.realize(bitonic_sorted);
    double t_bitonic = benchmark(1, 10, [&]() {
        f.realize(bitonic_sorted);
    });

    printf("Merge sort...\n");
    f = merge_sort(input, N);
    f.bound(x, 0, N);
    f.compile_jit();
    printf("Running...\n");
    Buffer<int> merge_sorted(N);
    // One untimed run before handing the pipeline to benchmark().
    f.realize(merge_sorted);
    double t_merge = benchmark(1, 10, [&]() {
        f.realize(merge_sorted);
    });

    // Reference result: copy the data and sort it with the standard library.
    Buffer<int> correct(N);
    for (int i = 0; i < N; i++) {
        correct(i) = data(i);
    }
    printf("std::sort...\n");
    double t_std = benchmark(1, 1, [&]() {
        // Form the end pointer by arithmetic on the first element instead
        // of indexing element N, which lies one past the end of the buffer.
        int *correct_begin = &correct(0);
        std::sort(correct_begin, correct_begin + N);
    });

    printf("Times:\n"
           "bitonic sort: %fms \n"
           "merge sort: %fms \n"
           "std::sort %fms\n",
           t_bitonic * 1e3, t_merge * 1e3, t_std * 1e3);

    // Dump all three results side by side only for small problem sizes.
    if (N <= 100) {
        for (int i = 0; i < N; i++) {
            printf("%8d %8d %8d\n",
                   correct(i), bitonic_sorted(i), merge_sorted(i));
        }
    }

    for (int i = 0; i < N; i++) {
        if (bitonic_sorted(i) != correct(i)) {
            printf("bitonic sort failed: %d -> %d instead of %d\n", i, bitonic_sorted(i), correct(i));
            return -1;
        }
        if (merge_sorted(i) != correct(i)) {
            printf("merge sort failed: %d -> %d instead of %d\n", i, merge_sorted(i), correct(i));
            return -1;
        }
    }

    return 0;
}

コード例 #14

0

ファイルを表示

ファイル: linearCombinationKernel.cpp プロジェクト: stanford-gfx/astro

int main(int argc, char *argv[]) {
#if !defined(STANDALONE) && !defined(TESTING_GPU)
    auto im = afwImage::MaskedImage<float>("../calexp-004207-g3-0123.fits");
    int width = im.getWidth(), height = im.getHeight();

#else
    int width = 2048, height = 1489;
//    int width = 200, height = 200;
    printf("[no load]");
#endif
    printf("Loaded: %d x %d\n", width, height);

    //store image data in img_var(x, y, 0) and variance data in img_var(x, y, 1)
    Image<float> image(width, height);
    Image<float> variance(width, height);
    Image<uint16_t> mask(width, height);

#if !defined(STANDALONE) && !defined(TESTING_GPU) 
    //Read image in
    for (int y = 0; y < im.getHeight(); y++) {
        afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>::x_iterator inPtr = im.x_at(0, y);
        for (int x = 0; x < im.getWidth(); x++){
            image(x, y) = (*inPtr).image();
            variance(x, y) = (*inPtr).variance();
            mask(x, y) = (*inPtr).mask();
            inPtr++;
        }
    }
#endif

    int boundingBox = 5; 
    Var x, y, i_v, y0, yi;

    //compute output image and variance
    //Polynomials that define weights of spatially variant linear combination of 5 kernels
    Func polynomial1, polynomial2, polynomial3, polynomial4, polynomial5;
    polynomial1(x, y) = 0.1f + 0.002f*x + 0.003f*y + 0.4f*x*x + 0.5f*x*y
                     + 0.6f*y*y +  0.0007f*x*x*x + 0.0008f*x*x*y + 0.0009f*x*y*y
                     + 0.00011f*y*y*y;

    //for experimenting with optimizations
    polynomial2(x, y) = 1.1f + 1.002f*x + 1.003f*y + 1.4f*x*x + 1.5f*x*y
                     + 1.6f*y*y +  1.0007f*x*x*x + 1.0008f*x*x*y + 1.0009f*x*y*y
                     + 1.00011f*y*y*y;

    //for experimenting with optimizations

    polynomial3(x, y) = 2.1f + 2.002f*x + 2.003f*y + 2.4f*x*x + 2.5f*x*y
                     + 2.6f*y*y +  2.0007f*x*x*x + 2.0008f*x*x*y + 2.0009f*x*y*y
                     + 2.00011f*y*y*y;

    //for experimenting with optimizations
    polynomial4(x, y) = 3.1f + 3.002f*x + 3.003f*y + 3.4f*x*x + 3.5f*x*y
                     + 3.6f*y*y +  3.0007f*x*x*x + 3.0008f*x*x*y + 3.0009f*x*y*y
                     + 3.00011f*y*y*y;

    //for experimenting with optimizations
    polynomial5(x, y) = 4.1f + 4.002f*x + 4.003f*y + 4.4f*x*x + 4.5f*x*y
                     + 4.6f*y*y +  4.0007f*x*x*x + 4.0008f*x*x*y + 4.0009f*x*y*y
                     + 4.00011f*y*y*y;

    //Kernel #1
    Func kernel1;
    float sigmaX1 = 2.0f;
    float sigmaY1 = 2.0f;
    float theta1 = 0.0f; //rotation of sigmaX axis
    kernel1(x, y) = (exp(-((x*cos(theta1) +y*sin(theta1))*(x*cos(theta1) +y*sin(theta1)))
                    /(2*sigmaX1*sigmaX1)) / (sqrtf(2*M_PI)*sigmaX1))
                    *(exp(-((y*cos(theta1) - x*sin(theta1))*(y*cos(theta1) - x*sin(theta1)))
                    /(2*sigmaY1*sigmaY1)) / (sqrtf(2*M_PI)*sigmaY1));



    //Kernel #2
    Func kernel2;
    float sigmaX2 = 0.5f;
    float sigmaY2 = 4.0f;
    float theta2 = 0.0f; //rotation of sigmaX axis
    kernel2(x, y) = (exp(-((x*cos(theta2) +y*sin(theta2))*(x*cos(theta2) +y*sin(theta2)))
                    /(2*sigmaX2*sigmaX2)) / (sqrtf(2*M_PI)*sigmaX2))
                    *(exp(-((y*cos(theta2) - x*sin(theta2))*(y*cos(theta2) - x*sin(theta2)))
                    /(2*sigmaY2*sigmaY2)) / (sqrtf(2*M_PI)*sigmaY2));

    //Kernel #3
    Func kernel3;
    float sigmaX3 = 0.5f;
    float sigmaY3 = 4.0f;
    float theta3 = 3.14159f/4; //rotation of sigmaX axis
    kernel3(x, y) = (exp(-((x*cos(theta3) +y*sin(theta3))*(x*cos(theta3) +y*sin(theta3)))
                    /(2*sigmaX3*sigmaX3)) / (sqrtf(2*M_PI)*sigmaX3))
                    *(exp(-((y*cos(theta3) - x*sin(theta3))*(y*cos(theta3) - x*sin(theta3)))
                    /(2*sigmaY3*sigmaY3)) / (sqrtf(2*M_PI)*sigmaY3));
    //Kernel #4
    Func kernel4;
    float sigmaX4 = 0.5f;
    float sigmaY4 = 4.0f;
    float theta4 = 3.14159f/2; //rotation of sigmaX axis
    kernel4(x, y) = (exp(-((x*cos(theta4) +y*sin(theta4))*(x*cos(theta4) +y*sin(theta4)))
                    /(2*sigmaX4*sigmaX4)) / (sqrtf(2*M_PI)*sigmaX4))
                    *(exp(-((y*cos(theta4) - x*sin(theta4))*(y*cos(theta4) - x*sin(theta4)))
                    /(2*sigmaY4*sigmaY4)) / (sqrtf(2*M_PI)*sigmaY4));


    //Kernel #5
    Func kernel5;
    float sigmaX5 = 4.0f;
    float sigmaY5 = 4.0f;
    float theta5 = 0.0; //rotation of sigmaX axis
    kernel5(x, y) = (exp(-((x*cos(theta5) +y*sin(theta5))*(x*cos(theta5) +y*sin(theta5)))
                    /(2*sigmaX5*sigmaX5)) / (sqrtf(2*M_PI)*sigmaX5))
                    *(exp(-((y*cos(theta5) - x*sin(theta5))*(y*cos(theta5) - x*sin(theta5)))
                    /(2*sigmaY5*sigmaY5)) / (sqrtf(2*M_PI)*sigmaY5));


    //Compute output image plane
    Func image_bounded ("image_bounded");
    image_bounded = BoundaryConditions::repeat_edge(image);


    //Spatially Invariant Implementation 1
/*    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help += image_bounded(x + i, y + j) * (kernel1(i, j) + kernel2(i, j) +
                                kernel3(i, j) + kernel4(i, j) + kernel5(i, j)); 
            norm += (kernel1(i, j) + kernel2(i, j) + kernel3(i, j) + kernel4(i, j) + kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage ("blurImage");
    blurImage(x, y) = blur_image_help;
*/

    //Spatially Invariant Implementation 2
/*
    Expr blur_image_help1 = 0.0f;
    Expr norm1 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help1 += image_bounded(x + i, y + j) * kernel1(i, j); 
            norm1 += kernel1(i, j);
        }
    }
//    blur_image_help1 = blur_image_help1/norm1;
    Func blurImage1 ("blurImage1");
    blurImage1(x, y) = blur_image_help1;

    Expr blur_image_help2 = 0.0f;
    Expr norm2 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help2 += image_bounded(x + i, y + j) * kernel2(i, j); 
            norm2 += kernel2(i, j);
        }
    }
//    blur_image_help2 = blur_image_help2/norm2;
    Func blurImage2 ("blurImage2");
    blurImage2(x, y) = blur_image_help2;

    Expr blur_image_help3 = 0.0f;
    Expr norm3 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help3 += image_bounded(x + i, y + j) * kernel3(i, j); 
            norm3 += kernel3(i, j);
        }
    }
//    blur_image_help3 = blur_image_help3/norm3;
    Func blurImage3 ("blurImage3");
    blurImage3(x, y) = blur_image_help3;

    Expr blur_image_help4 = 0.0f;
    Expr norm4 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help4 += image_bounded(x + i, y + j) * kernel4(i, j); 
            norm4 += kernel4(i, j);
        }
    }
//    blur_image_help4 = blur_image_help4/norm4;
    Func blurImage4 ("blurImage4");
    blurImage4(x, y) = blur_image_help4;

    Expr blur_image_help5 = 0.0f;
    Expr norm5 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help5 += image_bounded(x + i, y + j) * kernel5(i, j); 
            norm5 += kernel5(i, j);
        }
    }
//    blur_image_help5 = blur_image_help5/norm5;
    Func blurImage5 ("blurImage5");
    blurImage5(x, y) = blur_image_help5;


    Func blurImage ("blurImage");
//    blurImage(x, y) = (blurImage1(x, y) + blurImage2(x, y) + blurImage3(x, y) +
//                        blurImage4(x, y) + blurImage5(x, y))/(5*norm1);
    blurImage(x, y) = (blur_image_help1 + blur_image_help2 + blur_image_help3 + 
                        blur_image_help4 + blur_image_help5)/(5*norm1);
*/




    //Spatially Variant Implementation 1
    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_image_help += image_bounded(x + i, y + j) * (polynomial1(x, y)*kernel1(i, j) +
                polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + 
                polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); 
            norm += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + 
                polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + 
                polynomial5(x, y)*kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage ("blurImage");
    blurImage(x, y) = blur_image_help;





    //Compute output variance plane
    Func variance_bounded ("variance_bounded");
    variance_bounded = BoundaryConditions::repeat_edge(variance);
    //compute Variance output
    Func blurVariance ("blurVariance");
    Expr blur_variance_help = 0.0f;
    Expr vNorm2 = 0.0f;
    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            blur_variance_help += variance_bounded(x + i, y + j) * (polynomial1(x, y)*kernel1(i, j) +
                polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + 
                polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j))
                *(polynomial1(x, y)*kernel1(i, j) +
                polynomial2(x, y)*kernel2(i, j) + polynomial3(x, y)*kernel3(i, j) + 
                polynomial4(x, y)*kernel4(i, j) + polynomial5(x, y)*kernel5(i, j)); 
            vNorm2 += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + 
                polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + 
                polynomial5(x, y)*kernel5(i, j))
                *(polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + 
                polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + 
                polynomial5(x, y)*kernel5(i, j));
        }
    }
//    blur_variance_help = blur_variance_help/(norm(x,y)*norm(x,y));
    blur_variance_help = blur_variance_help/(vNorm2*vNorm2);
    blurVariance(x, y) = blur_variance_help;



    //Compute output mask plane
    Func mask_bounded ("mask_bounded");
    mask_bounded = BoundaryConditions::repeat_edge(mask);

    Func maskOut ("maskOut");

    Expr maskOutHelp = 0;

    for(int i = -boundingBox; i <= boundingBox; i++){
        for(int j = -boundingBox; j <= boundingBox; j++){
            maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j) + 
                polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j) + 
                polynomial5(x, y)*kernel5(i, j)) == 0.0f, maskOutHelp, maskOutHelp | mask_bounded(x + i, y + j));
//            maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j);    
        }
    }
    maskOut(x, y) = maskOutHelp;



    //Schedule
  //  blur.reorder(i_v, x, y);


//    kernel1.compute_at(blurImage, x);
//    kernel1.vectorize(x, 8);
//    kernel1.split(y, y0, yi, 4);
//    kernel1.parallel(y0);

/*    kernel1.compute_root();
    kernel2.compute_root();
    kernel3.compute_root();
    kernel4.compute_root();
    kernel5.compute_root();
*/
    //best schedule found:

#ifdef TESTING_GPU
        blurImage.gpu_tile(x, y, 16, 16);

        // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
        // not enabled by default. We have to construct a Target
        // object, enable one of them, and then pass that target
        // object to compile_jit. Otherwise your CPU will very slowly
        // pretend it's a GPU, and use one thread per output pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or CUDA.

        // We'll enable OpenCL here, because it tends to give better
        // performance than CUDA, even with NVidia's drivers, because
        // NVidia's open source LLVM backend doesn't seem to do all
        // the same optimizations their proprietary compiler does.
        target.set_feature(Target::OpenCL);

        // Uncomment the next line and comment out the line above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL or CUDA API calls done
        // by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);

        blurImage.compile_jit(target);
#else
        blurImage.split(y, y0, yi, 4);
        blurImage.parallel(y0);
        blurImage.vectorize(x, 8);
#endif

    // Split the y coordinate of the consumer into strips:
    blurVariance.split(y, y0, yi, 4);
    // Compute the strips using a thread pool and a task queue.
    blurVariance.parallel(y0);
    // Vectorize across x.
    blurVariance.vectorize(x, 8);

//    polynomial1.compute_at(blurImage, x).vectorize(x, 8);
//    kernel1.compute_at(blurImage, x).vectorize(x, 8);


    // Split the y coordinate of the consumer into strips of 16 scanlines:
    maskOut.split(y, y0, yi, 30);
    // Compute the strips using a thread pool and a task queue.
    maskOut.parallel(y0);
    // Vectorize across x by a factor of four.
    maskOut.vectorize(x, 8);

//    kernel1.trace_stores();
//    blurImage.trace_stores();




    //Check out what is happening
    blurImage.print_loop_nest();
    // Print out pseudocode for the pipeline.
    blurImage.compile_to_lowered_stmt("linearCombinationKernelBlurImage.html", {image}, HTML);
//    blurImage.compile_to_c("linearCombinationKernel_C_Code.cpp", std::vector<Argument>(), "linearCombinationKernel_C_Code");
//    blurVariance.compile_to_lowered_stmt("blur.html", {variance}, HTML);



    // Benchmark the pipeline.
#ifdef TESTING_GPU
    Buffer image_output(Float(32), image.width(), image.height()); //for GPU testing
#else
    Image<float> image_output(image.width(), image.height());
#endif

    blurImage.realize(image_output);

    Image<float> variance_output(variance.width(), variance.height());
    blurVariance.realize(variance_output);

    Image<int32_t> mask_output(mask.width(), mask.height());
    maskOut.realize(mask_output);

#ifdef TESTING_GPU 
    // Run the filter once to initialize any GPU runtime state.
    blurImage.realize(image_output);

    // Now take the best of 3 runs for timing.
    double best_time;
    for (int i = 0; i < 3; i++) {

        double t1 = current_time();

        // Run the filter 100 times.
        for (int j = 0; j < 100; j++) {
            blurImage.realize(image_output);
        }

        // Force any GPU code to finish by copying the buffer back to the CPU.
        image_output.copy_to_host();

        double t2 = current_time();

        double elapsed = (t2 - t1)/100;
        if (i == 0 || elapsed < best_time) {
            best_time = elapsed;
        }
    }

    printf("%1.4f milliseconds\n", best_time);
#else

	double average = 0;
    double min;
    double max;
    double imgTime;
    double varTime;
    double maskTime;
    int numberOfRuns = 5;
    for (int i = 0; i < numberOfRuns; i++) {
        double t1 = current_time();
        blurImage.realize(image_output);
        double t2 = current_time();
        blurVariance.realize(variance_output);
        double t3 = current_time();
        maskOut.realize(mask_output);
        double t4 = current_time();
        double curTime = (t4-t1);
        average += curTime;
        if(i == 0){
            min = curTime;
            max = curTime;
            imgTime = t2-t1;
            varTime = t3-t2;
            maskTime = t4-t3;
        }
        else{
            if(curTime < min){
                min = curTime;
                imgTime = t2-t1;
                varTime = t3-t2;
                maskTime = t4-t3;
            }
            if(curTime > max)
                max = curTime;
        }
    }
    average = average/numberOfRuns;
    std::cout << "Average Time: " << average << ", Min = " <<
    min << ", Max = " << max << ", with " << numberOfRuns <<
    " runs" << '\n';
    cout << "For fastest run total time = " << min << ", imgTime = " << imgTime << ", varTime = " << varTime << 
    "maskTime = " << maskTime << endl;
#endif



#if !defined(STANDALONE) && !defined(TESTING_GPU)    
    //write image out
    auto imOut = afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>(im.getWidth(), im.getHeight());
    for (int y = 0; y < imOut.getHeight(); y++) {
    	afwImage::MaskedImage<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel>::x_iterator inPtr = imOut.x_at(0, y);

        for (int x = 0; x < imOut.getWidth(); x++){
        	afwImage::pixel::SinglePixel<float, lsst::afw::image::MaskPixel, lsst::afw::image::VariancePixel> 
            curPixel(image_output(x, y), mask_output(x, y), variance_output(x, y));
        	(*inPtr) = curPixel;
        	inPtr++;

        }
    }

	imOut.writeFits("./halideLinearCombination5x5.fits");
#endif

}

コード例 #15

0

ファイルを表示

ファイル: lesson_12_using_the_gpu.cpp プロジェクト: darkbuck/Halide

    // Now a schedule that uses a GPU API: OpenCL, Metal, or CUDA.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell cuda that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, block, thread, 16);

        // Func::gpu_tile behaves the same as Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing
        // the intermediate result in shared memory. In the schedule
        // above xo corresponds to GPU blocks.
        padded.compute_at(curved, xo);

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
        // Metal are not enabled by default. We have to construct a
        // Target object, enable one of them, and then pass that
        // target object to compile_jit. Otherwise your CPU will very
        // slowly pretend it's a GPU, and use one thread per output
        // pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or Metal, depending on which platform
        // we're on. OS X doesn't update its OpenCL drivers, so they
        // tend to be broken. CUDA would also be a fine choice on
        // machines with NVidia GPUs.
        if (target.os == Target::OSX) {
            target.set_feature(Target::Metal);
        } else {
            target.set_feature(Target::OpenCL);
        }

        // Uncomment the next line and comment out the lines above to
        // try CUDA instead.
        // target.set_feature(Target::CUDA);

        // If you want to see all of the OpenCL, Metal, or CUDA API
        // calls done by the pipeline, you can also enable the Debug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        // target.set_feature(Target::Debug);

        curved.compile_jit(target);
    }