int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();
    if (!target.has_feature(Target::OpenCL)) {
        printf("This test requires opencl.\n");
        return 0;
    }

    // These calls are only available for AOT-compiled code:
    //
    // halide_set_custom_get_symbol(my_get_symbol_impl);
    // halide_set_custom_load_library(my_load_library_impl);
    // halide_set_custom_get_library_symbol(my_get_library_symbol_impl);
    //
    // For JIT code, we must use JITSharedRuntime::set_default_handlers().
    Internal::JITHandlers handlers;
    handlers.custom_get_symbol = my_get_symbol_impl;
    handlers.custom_load_library = my_load_library_impl;
    handlers.custom_get_library_symbol = my_get_library_symbol_impl;
    Internal::JITSharedRuntime::set_default_handlers(handlers);

    Var x, y, xi, yi;
    Func f;
    f(x, y) = cast<int32_t>(x + y);
    f.gpu_tile(x, y, xi, yi, 8, 8, TailStrategy::Auto, DeviceAPI::OpenCL);
    f.set_error_handler(my_error_handler);

    Buffer<int32_t> out = f.realize(64, 64, target);

    fprintf(stderr, "Should not get here.\n");
    return -1;
}
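// A plausible sketch of the handlers registered above (not necessarily the
// test's actual definitions): each one logs the request and then forwards to
// the default runtime implementation, so the pipeline still works while the
// test observes that the JIT routes lookups through the custom hooks.
// halide_get_symbol, halide_load_library, and halide_get_library_symbol are
// the runtime defaults declared in HalideRuntime.h.
void *my_get_symbol_impl(const char *name) {
    printf("get_symbol: %s\n", name);
    return halide_get_symbol(name);
}

void *my_load_library_impl(const char *name) {
    printf("load_library: %s\n", name);
    return halide_load_library(name);
}

void *my_get_library_symbol_impl(void *lib, const char *name) {
    printf("get_library_symbol: %s\n", name);
    return halide_get_library_symbol(lib, name);
}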
int main(int argc, char **argv) {
    // Make sure that freeing GPU buffers doesn't occur before the
    // computation that is filling them completes.
    Func f;
    Var x, y;
    RDom r(0, 100);
    f(x, y) = sum(sqrt(sqrt(sqrt(sqrt(x + y + r)))));

    Target t = get_jit_target_from_environment();
    if (t.has_feature(Target::OpenCL) || t.has_feature(Target::CUDA)) {
        f.gpu_tile(x, y, 16, 16);

        // This allocates a buffer, does gpu compute into it, and then
        // frees it (calling dev_free) possibly before the compute is
        // done.
        for (int i = 0; i < 10; i++) {
            f.realize(1024, 1024, t);
        }
    } else {
        // Skip this test if gpu target not enabled (it's pretty slow on a cpu).
    }

    printf("Success!\n");
    return 0;
}
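// A minimal sketch of the synchronization point this test relies on (an
// assumed usage, not part of the test): when the host actually needs the
// data, copying the buffer back blocks until the device work that fills it
// has finished -- the same trick the GPU benchmark later in this section uses.
Buffer<float> result = f.realize(1024, 1024, t);
result.copy_to_host();  // blocks until the kernel filling 'result' completes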
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, 16);

    // The Func::gpu_tile method is similar to Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing the
    // intermediate result in shared memory. Var::gpu_blocks and
    // Var::gpu_threads exist to help you schedule producers within
    // GPU threads and blocks.
    padded.compute_at(curved, Var::gpu_blocks());

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.features |= Target::OpenCL;

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.features |= Target::CUDA;

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the GPUDebug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.features |= Target::GPUDebug;

    curved.compile_jit(target);
}
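// A short usage sketch for the schedule above (the output buffer name is an
// assumption, not part of the lesson): the first realize() pays the one-time
// GPU initialization and the copy of the input to the device; later runs
// reuse the device-resident input and mostly just launch the kernels.
schedule_for_gpu();
Image<uint8_t> result(input.width(), input.height(), input.channels());
curved.realize(result);  // slow first run: GPU setup + host-to-device copies
curved.realize(result);  // subsequent runs measure mostly kernel time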
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    {
        Func f;
        Var x, y, z;

        // Construct a Func with lots of potential race conditions, and
        // then run it in thread blocks on the gpu.
        f(x, y) = x + 100 * y;

        const int passes = 10;
        for (int i = 0; i < passes; i++) {
            RDom rx(0, 10);
            // Flip each row, using spots 10-19 as temporary storage
            f(rx + 10, y) = f(9 - rx, y);
            f(rx, y) = f(rx + 10, y);
            // Flip each column the same way
            RDom ry(0, 8);
            f(x, ry + 8) = f(x, 7 - ry);
            f(x, ry) = f(x, ry + 8);
        }

        Func g;
        g(x, y) = f(0, 0) + f(9, 7);

        g.gpu_tile(x, y, 16, 8);
        f.compute_at(g, Var::gpu_blocks());
        for (int i = 0; i < passes; i++) {
            f.update(i*4 + 0).gpu_threads(y);
            f.update(i*4 + 1).gpu_threads(y);
            f.update(i*4 + 2).gpu_threads(x);
            f.update(i*4 + 3).gpu_threads(x);
        }

        Image<int> out = g.realize(100, 100);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 7*100 + 9;
                if (out(x, y) != correct) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, out(x, y), correct);
                    return -1;
                }
            }
        }
    }

    {
        // Construct a Func with undef stages, then run it in thread
        // blocks and make sure the right number of syncthreads are
        // added.
        Func f;
        Var x, y, z;
        f(x, y) = undef<int>();
        f(x, y) += x + 100 * y;
        // This next line is dubious, because it entirely masks the
        // effect of the previous definition. If you add an undefined
        // value to the previous def, then Halide can evaluate this to
        // whatever it likes. Currently we'll just elide this update
        // definition.
        f(x, y) += undef<int>();
        f(x, y) += y * 100 + x;

        Func g;
        g(x, y) = f(0, 0) + f(7, 7);

        g.gpu_tile(x, y, 8, 8);
        f.compute_at(g, Var::gpu_blocks());
        f.gpu_threads(x, y);
        f.update(0).gpu_threads(x, y);
        f.update(1).gpu_threads(x, y);
        f.update(2).gpu_threads(x, y);

        // There should be two thread barriers: one in between the
        // non-undef definitions, and one between f and g.
        g.add_custom_lowering_pass(new CheckBarrierCount(2));
        Image<int> out = g.realize(100, 100);
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    Target target = get_jit_target_from_environment();

    if (1) {
        // Test a tuple reduction on the gpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);

        // Updates to a reduction are atomic.
        f(x, y) = Tuple(f(x, y)[1]*2, f(x, y)[0]*2);
        // now equals ((x - y)*2, (x + y)*2)

        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
            f.update().gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
            f.update().hexagon(y).vectorize(x, 32);
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y)*2;
                int correct_b = (x + y)*2;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Now test one that alternates between cpu and gpu per update step
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the pure step and the odd update steps on the gpu
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon(y).vectorize(x, 32);
        }
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            } else {
                f.update(i);
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // Same as above, but switches which steps are gpu and cpu
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y, x - y);
        for (size_t i = 0; i < 10; i++) {
            // Swap the tuple elements and increment both
            f(x, y) = Tuple(f(x, y)[1] + 1, f(x, y)[0] + 1);
        }

        // Schedule the even update steps on the gpu
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                f.update(i);
            } else {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x + y) + 10;
                int correct_b = (x - y) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    if (1) {
        // In this one, each step only uses one of the tuple elements
        // of the previous step, so only that buffer should get copied
        // back to host or copied to device.
        Func f;
        Var x, y;

        f(x, y) = Tuple(x + y - 1000, x - y + 1000);
        for (size_t i = 0; i < 10; i++) {
            f(x, y) = Tuple(f(x, y)[1] - 1, f(x, y)[1] + 1);
        }

        // Schedule the even update steps on the gpu
        for (int i = 0; i < 10; i++) {
            if (i & 1) {
                f.update(i);
            } else {
                if (target.has_gpu_feature()) {
                    f.update(i).gpu_tile(x, y, 16, 16);
                } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                    f.update(i).hexagon(y).vectorize(x, 32);
                }
            }
        }

        Realization result = f.realize(1024, 1024);
        Image<int> a = result[0], b = result[1];
        for (int y = 0; y < a.height(); y++) {
            for (int x = 0; x < a.width(); x++) {
                int correct_a = (x - y + 1000) + 8;
                int correct_b = (x - y + 1000) + 10;
                if (a(x, y) != correct_a || b(x, y) != correct_b) {
                    printf("result(%d, %d) = (%d, %d) instead of (%d, %d)\n",
                           x, y, a(x, y), b(x, y), correct_a, correct_b);
                    return -1;
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("No gpu target enabled. Skipping test.\n");
        return 0;
    }

    Var x, y, z, w;

    Image<int> full(80, 60, 10, 10);
    const int x_off = 4, y_off = 8, z_off = 2, w_off = 4;
    const int x_size = 16, y_size = 16, z_size = 3, w_size = 3;

    buffer_t cropped = *full.raw_buffer();
    cropped.host = (uint8_t *)&(full(x_off, y_off, z_off, w_off));
    cropped.min[0] = 0;
    cropped.min[1] = 0;
    cropped.min[2] = 0;
    cropped.min[3] = 0;
    cropped.extent[0] = x_size;
    cropped.extent[1] = y_size;
    cropped.extent[2] = z_size;
    cropped.extent[3] = w_size;
    cropped.stride[0] *= 2;
    cropped.stride[1] *= 2;
    cropped.stride[2] *= 2;
    cropped.stride[3] *= 2;
    Buffer out(Int(32), &cropped);

    // Make a bitmask representing the region inside the crop.
    Image<bool> in_subregion(80, 60, 10, 10);
    Expr test = ((x >= x_off) && (x < x_off + x_size*2) &&
                 (y >= y_off) && (y < y_off + y_size*2) &&
                 (z >= z_off) && (z < z_off + z_size*2) &&
                 (w >= w_off) && (w < w_off + w_size*2) &&
                 (x % 2 == 0) && (y % 2 == 0) &&
                 (z % 2 == 0) && (w % 2 == 0));
    Func test_func;
    test_func(x, y, z, w) = test;
    test_func.realize(in_subregion);

    Func f;
    f(x, y, z, w) = 3*x + 2*y + z + 4*w;
    f.gpu_tile(x, y, 16, 16);
    f.output_buffer().set_stride(0, Expr());
    f.realize(out);

    // Put some data in the full host buffer, avoiding the region
    // being evaluated above.
    Expr change_out_of_subregion = select(test, undef<int>(), 4*x + 3*y + 2*z + w);
    lambda(x, y, z, w, change_out_of_subregion).realize(full);

    // Copy back the output subset from the GPU.
    out.copy_to_host();

    for (int w = 0; w < full.extent(3); ++w) {
        for (int z = 0; z < full.extent(2); ++z) {
            for (int y = 0; y < full.extent(1); ++y) {
                for (int x = 0; x < full.extent(0); ++x) {
                    int correct;
                    if (in_subregion(x, y, z, w)) {
                        int x_ = (x - x_off)/2;
                        int y_ = (y - y_off)/2;
                        int z_ = (z - z_off)/2;
                        int w_ = (w - w_off)/2;
                        correct = 3*x_ + 2*y_ + z_ + 4*w_;
                    } else {
                        correct = 4*x + 3*y + 2*z + w;
                    }
                    if (full(x, y, z, w) != correct) {
                        printf("Error! Incorrect value %i != %i at %i, %i, %i, %i\n",
                               full(x, y, z, w), correct, x, y, z, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
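// Sketch of the index mapping the crop above sets up (same names as the
// test): doubling each stride makes the cropped view sample every other
// element of 'full', starting at the offset corner, so
//
//     cropped(x, y, z, w) aliases full(x_off + 2*x, y_off + 2*y,
//                                      z_off + 2*z, w_off + 2*w),
//
// which is exactly what the verification loop above inverts with
// x_ = (x - x_off)/2, and so on for y, z, and w.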
int main(int argc, char **argv) {
    Buffer<uint8_t> input(128, 64);

    for (int y = 0; y < input.height(); y++) {
        for (int x = 0; x < input.width(); x++) {
            input(x, y) = y*input.width() + x;
        }
    }

    Var x, y, xi, yi;

    {
        Func f;
        f(x, y) = select(((input(x, y) > 10) && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!(input(x, y) > 50))),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = ((input(x, y) > 10) && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!(input(x, y) > 50)));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses a let resulting from common
    // subexpression elimination.
    {
        Func f;
        Expr common_cond = input(x, y) > 10;
        f(x, y) = select((common_cond && (input(x, y) < 20)) ||
                         ((input(x, y) > 40) && (!common_cond)),
                         u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool common_cond = input(x, y) > 10;
                bool cond = (common_cond && (input(x, y) < 20)) ||
                            ((input(x, y) > 40) && (!common_cond));
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition which has vector and scalar inputs.
    {
        Func f("f");
        f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = x < 10 || x > 20 || y < 10 || y > 20;
                uint8_t correct = cond ? 0 : input(x, y);
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a condition that uses differently sized types.
    {
        Func f;
        Expr ten = 10;
        f(x, y) = select(input(x, y) > ten, u8(255), u8(0));

        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, 8);
        }

        Buffer<uint8_t> output = f.realize(input.width(), input.height(), target);

        for (int y = 0; y < input.height(); y++) {
            for (int x = 0; x < input.width(); x++) {
                bool cond = input(x, y) > 10;
                uint8_t correct = cond ? 255 : 0;
                if (correct != output(x, y)) {
                    fprintf(stderr, "output(%d, %d) = %d instead of %d\n",
                            x, y, output(x, y), correct);
                    return -1;
                }
            }
        }
    }

    // Test a select where the condition has a different width than
    // the true/false values.
    for (int w = 8; w <= 32; w *= 2) {
        for (int n = 8; n < w; n *= 2) {
            Type narrow = UInt(n), wide = UInt(w);

            Func in_wide;
            in_wide(x, y) = cast(wide, y + x*3);
            in_wide.compute_root();

            Func in_narrow;
            in_narrow(x, y) = cast(narrow, x*y + x - 17);
            in_narrow.compute_root();

            Func f;
            f(x, y) = select(in_narrow(x, y) > 10,
                             in_wide(x, y*2),
                             in_wide(x, y*2 + 1));

            Func cpu;
            cpu(x, y) = f(x, y);
            Func gpu;
            gpu(x, y) = f(x, y);
            Func out;
            out(x, y) = {cast<uint32_t>(cpu(x, y)), cast<uint32_t>(gpu(x, y))};
            cpu.compute_root();
            gpu.compute_root();

            Target target = get_jit_target_from_environment();
            if (target.has_feature(Target::OpenCL) && n == 16 && w == 32) {
                // Workaround for https://github.com/halide/Halide/issues/2477
                printf("Skipping uint%d -> uint%d for OpenCL\n", n, w);
                continue;
            }
            if (target.has_gpu_feature()) {
                gpu.gpu_tile(x, y, xi, yi, 16, 16).vectorize(xi, 4);
            } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
                gpu.hexagon().vectorize(x, 128);
            } else {
                // Just test vectorization
                gpu.vectorize(x, 8);
            }

            Realization r = out.realize(input.width(), input.height(), target);
            Buffer<uint32_t> cpu_output = r[0];
            Buffer<uint32_t> gpu_output = r[1];
            for (int y = 0; y < input.height(); y++) {
                for (int x = 0; x < input.width(); x++) {
                    if (cpu_output(x, y) != gpu_output(x, y)) {
                        fprintf(stderr, "gpu_output(%d, %d) = %d instead of %d for uint%d -> uint%d\n",
                                x, y, gpu_output(x, y), cpu_output(x, y), n, w);
                        return -1;
                    }
                }
            }
        }
    }

    printf("Success!\n");
    return 0;
}
int main(int argc, char **argv) {
    Target t(get_jit_target_from_environment());
    if (!t.has_gpu_feature()) {
        printf("Not running test because no gpu target enabled\n");
        return 0;
    }

    const int n_types = 9;
    Type types[] = {Int(8), Int(16), Int(32), Int(64),
                    UInt(8), UInt(16), UInt(32), UInt(64),
                    Float(32)};
    Func funcs[n_types];

    Var x;
    Func out;

    Type result_type;
    if (t.has_feature(Target::Metal)) {
        result_type = UInt(32);
    } else {
        result_type = UInt(64);
    }

    Expr e = cast(result_type, 0);
    int offset = 0;
    for (int i = 0; i < n_types; i++) {
        int off = 0;
        if ((types[i].is_int() || types[i].is_uint())) {
            // Metal does not support 64-bit integers.
            if (t.has_feature(Target::Metal) && types[i].bits() >= 64) {
                continue;
            }
            if (types[i].bits() <= 64) {
                off = (1 << (types[i].bits() - 4)) + 17;
            }
        }
        offset += off;
        funcs[i](x) = cast(types[i], x/16 + off);
        e += cast(result_type, funcs[i](x));
        funcs[i].compute_at(out, Var::gpu_blocks()).gpu_threads(x);
    }

    out(x) = e;
    out.gpu_tile(x, 23);

    Buffer output = out.realize(23*5);

    int result;
    if (t.has_feature(Target::Metal)) {
        result = check_result<uint32_t>(output, n_types - 2, offset);
    } else {
        result = check_result<uint64_t>(output, n_types, offset);
    }
    if (result != 0) {
        return result;
    }

    printf("Success!\n");
    return 0;
}
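// A plausible sketch of the checker assumed above (not necessarily the
// test's actual helper). Every Func that was included contributes x/16 plus
// its per-type offset, and those offsets were accumulated into 'offset', so
// the expected value at x is n_included*(x/16) + offset. (The narrow-typed
// Funcs don't wrap here because x stays small.)
template<typename T>
int check_result(Buffer output, int n_included, int offset) {
    Image<T> buf(output);
    for (int x = 0; x < buf.width(); x++) {
        T correct = (T)(n_included * (x / 16) + offset);
        if (buf(x) != correct) {
            printf("output(%d) = %lld instead of %lld\n",
                   x, (long long)buf(x), (long long)correct);
            return -1;
        }
    }
    return 0;  // the caller treats any nonzero value as failure
}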
int main(int argc, char *argv[]) {
#if !defined(STANDALONE) && !defined(TESTING_GPU)
    auto im = afwImage::MaskedImage<float>("../calexp-004207-g3-0123.fits");
    int width = im.getWidth(), height = im.getHeight();
#else
    int width = 2048, height = 1489;
    // int width = 200, height = 200;
    printf("[no load]");
#endif
    printf("Loaded: %d x %d\n", width, height);

    // Store image data in image(x, y), variance data in variance(x, y),
    // and mask data in mask(x, y).
    Image<float> image(width, height);
    Image<float> variance(width, height);
    Image<uint16_t> mask(width, height);

#if !defined(STANDALONE) && !defined(TESTING_GPU)
    // Read image in
    for (int y = 0; y < im.getHeight(); y++) {
        afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                              lsst::afw::image::VariancePixel>::x_iterator inPtr = im.x_at(0, y);
        for (int x = 0; x < im.getWidth(); x++) {
            image(x, y) = (*inPtr).image();
            variance(x, y) = (*inPtr).variance();
            mask(x, y) = (*inPtr).mask();
            inPtr++;
        }
    }
#endif

    int boundingBox = 5;
    Var x, y, i_v, y0, yi;

    // Compute output image and variance.
    // Polynomials that define the weights of a spatially variant linear
    // combination of 5 kernels. (Coefficients chosen for experimenting
    // with optimizations.)
    Func polynomial1, polynomial2, polynomial3, polynomial4, polynomial5;
    polynomial1(x, y) = 0.1f + 0.002f*x + 0.003f*y + 0.4f*x*x + 0.5f*x*y + 0.6f*y*y
                      + 0.0007f*x*x*x + 0.0008f*x*x*y + 0.0009f*x*y*y + 0.00011f*y*y*y;
    polynomial2(x, y) = 1.1f + 1.002f*x + 1.003f*y + 1.4f*x*x + 1.5f*x*y + 1.6f*y*y
                      + 1.0007f*x*x*x + 1.0008f*x*x*y + 1.0009f*x*y*y + 1.00011f*y*y*y;
    polynomial3(x, y) = 2.1f + 2.002f*x + 2.003f*y + 2.4f*x*x + 2.5f*x*y + 2.6f*y*y
                      + 2.0007f*x*x*x + 2.0008f*x*x*y + 2.0009f*x*y*y + 2.00011f*y*y*y;
    polynomial4(x, y) = 3.1f + 3.002f*x + 3.003f*y + 3.4f*x*x + 3.5f*x*y + 3.6f*y*y
                      + 3.0007f*x*x*x + 3.0008f*x*x*y + 3.0009f*x*y*y + 3.00011f*y*y*y;
    polynomial5(x, y) = 4.1f + 4.002f*x + 4.003f*y + 4.4f*x*x + 4.5f*x*y + 4.6f*y*y
                      + 4.0007f*x*x*x + 4.0008f*x*x*y + 4.0009f*x*y*y + 4.00011f*y*y*y;

    // Kernel #1
    Func kernel1;
    float sigmaX1 = 2.0f;
    float sigmaY1 = 2.0f;
    float theta1 = 0.0f;  // rotation of sigmaX axis
    kernel1(x, y) = (exp(-((x*cos(theta1) + y*sin(theta1))*(x*cos(theta1) + y*sin(theta1)))
                         /(2*sigmaX1*sigmaX1)) / (sqrtf(2*M_PI)*sigmaX1))
                   *(exp(-((y*cos(theta1) - x*sin(theta1))*(y*cos(theta1) - x*sin(theta1)))
                         /(2*sigmaY1*sigmaY1)) / (sqrtf(2*M_PI)*sigmaY1));

    // Kernel #2
    Func kernel2;
    float sigmaX2 = 0.5f;
    float sigmaY2 = 4.0f;
    float theta2 = 0.0f;  // rotation of sigmaX axis
    kernel2(x, y) = (exp(-((x*cos(theta2) + y*sin(theta2))*(x*cos(theta2) + y*sin(theta2)))
                         /(2*sigmaX2*sigmaX2)) / (sqrtf(2*M_PI)*sigmaX2))
                   *(exp(-((y*cos(theta2) - x*sin(theta2))*(y*cos(theta2) - x*sin(theta2)))
                         /(2*sigmaY2*sigmaY2)) / (sqrtf(2*M_PI)*sigmaY2));

    // Kernel #3
    Func kernel3;
    float sigmaX3 = 0.5f;
    float sigmaY3 = 4.0f;
    float theta3 = 3.14159f/4;  // rotation of sigmaX axis
    kernel3(x, y) = (exp(-((x*cos(theta3) + y*sin(theta3))*(x*cos(theta3) + y*sin(theta3)))
                         /(2*sigmaX3*sigmaX3)) / (sqrtf(2*M_PI)*sigmaX3))
                   *(exp(-((y*cos(theta3) - x*sin(theta3))*(y*cos(theta3) - x*sin(theta3)))
                         /(2*sigmaY3*sigmaY3)) / (sqrtf(2*M_PI)*sigmaY3));

    // Kernel #4
    Func kernel4;
    float sigmaX4 = 0.5f;
    float sigmaY4 = 4.0f;
    float theta4 = 3.14159f/2;  // rotation of sigmaX axis
    kernel4(x, y) = (exp(-((x*cos(theta4) + y*sin(theta4))*(x*cos(theta4) + y*sin(theta4)))
                         /(2*sigmaX4*sigmaX4)) / (sqrtf(2*M_PI)*sigmaX4))
                   *(exp(-((y*cos(theta4) - x*sin(theta4))*(y*cos(theta4) - x*sin(theta4)))
                         /(2*sigmaY4*sigmaY4)) / (sqrtf(2*M_PI)*sigmaY4));

    // Kernel #5
    Func kernel5;
    float sigmaX5 = 4.0f;
    float sigmaY5 = 4.0f;
    float theta5 = 0.0f;  // rotation of sigmaX axis
    kernel5(x, y) = (exp(-((x*cos(theta5) + y*sin(theta5))*(x*cos(theta5) + y*sin(theta5)))
                         /(2*sigmaX5*sigmaX5)) / (sqrtf(2*M_PI)*sigmaX5))
                   *(exp(-((y*cos(theta5) - x*sin(theta5))*(y*cos(theta5) - x*sin(theta5)))
                         /(2*sigmaY5*sigmaY5)) / (sqrtf(2*M_PI)*sigmaY5));

    // Compute output image plane
    Func image_bounded("image_bounded");
    image_bounded = BoundaryConditions::repeat_edge(image);

    // Spatially Invariant Implementation 1
    /*
    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help += image_bounded(x + i, y + j)
                             * (kernel1(i, j) + kernel2(i, j) + kernel3(i, j)
                              + kernel4(i, j) + kernel5(i, j));
            norm += (kernel1(i, j) + kernel2(i, j) + kernel3(i, j)
                   + kernel4(i, j) + kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage("blurImage");
    blurImage(x, y) = blur_image_help;
    */

    // Spatially Invariant Implementation 2
    /*
    Expr blur_image_help1 = 0.0f;
    Expr norm1 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help1 += image_bounded(x + i, y + j) * kernel1(i, j);
            norm1 += kernel1(i, j);
        }
    }
    // blur_image_help1 = blur_image_help1/norm1;
    Func blurImage1("blurImage1");
    blurImage1(x, y) = blur_image_help1;

    Expr blur_image_help2 = 0.0f;
    Expr norm2 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help2 += image_bounded(x + i, y + j) * kernel2(i, j);
            norm2 += kernel2(i, j);
        }
    }
    // blur_image_help2 = blur_image_help2/norm2;
    Func blurImage2("blurImage2");
    blurImage2(x, y) = blur_image_help2;

    Expr blur_image_help3 = 0.0f;
    Expr norm3 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help3 += image_bounded(x + i, y + j) * kernel3(i, j);
            norm3 += kernel3(i, j);
        }
    }
    // blur_image_help3 = blur_image_help3/norm3;
    Func blurImage3("blurImage3");
    blurImage3(x, y) = blur_image_help3;

    Expr blur_image_help4 = 0.0f;
    Expr norm4 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help4 += image_bounded(x + i, y + j) * kernel4(i, j);
            norm4 += kernel4(i, j);
        }
    }
    // blur_image_help4 = blur_image_help4/norm4;
    Func blurImage4("blurImage4");
    blurImage4(x, y) = blur_image_help4;

    Expr blur_image_help5 = 0.0f;
    Expr norm5 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help5 += image_bounded(x + i, y + j) * kernel5(i, j);
            norm5 += kernel5(i, j);
        }
    }
    // blur_image_help5 = blur_image_help5/norm5;
    Func blurImage5("blurImage5");
    blurImage5(x, y) = blur_image_help5;

    Func blurImage("blurImage");
    // blurImage(x, y) = (blurImage1(x, y) + blurImage2(x, y) + blurImage3(x, y) +
    //                    blurImage4(x, y) + blurImage5(x, y))/(5*norm1);
    blurImage(x, y) = (blur_image_help1 + blur_image_help2 + blur_image_help3
                     + blur_image_help4 + blur_image_help5)/(5*norm1);
    */

    // Spatially Variant Implementation 1
    Expr blur_image_help = 0.0f;
    Expr norm = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_image_help += image_bounded(x + i, y + j)
                             * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                              + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                              + polynomial5(x, y)*kernel5(i, j));
            norm += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                   + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                   + polynomial5(x, y)*kernel5(i, j));
        }
    }
    blur_image_help = blur_image_help/norm;
    Func blurImage("blurImage");
    blurImage(x, y) = blur_image_help;

    // Compute output variance plane
    Func variance_bounded("variance_bounded");
    variance_bounded = BoundaryConditions::repeat_edge(variance);

    Func blurVariance("blurVariance");
    Expr blur_variance_help = 0.0f;
    Expr vNorm2 = 0.0f;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            blur_variance_help += variance_bounded(x + i, y + j)
                                * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                 + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                 + polynomial5(x, y)*kernel5(i, j))
                                * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                 + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                 + polynomial5(x, y)*kernel5(i, j));
            vNorm2 += (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                     + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                     + polynomial5(x, y)*kernel5(i, j))
                    * (polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                     + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                     + polynomial5(x, y)*kernel5(i, j));
        }
    }
    // blur_variance_help = blur_variance_help/(norm(x,y)*norm(x,y));
    blur_variance_help = blur_variance_help/(vNorm2*vNorm2);
    blurVariance(x, y) = blur_variance_help;

    // Compute output mask plane
    Func mask_bounded("mask_bounded");
    mask_bounded = BoundaryConditions::repeat_edge(mask);

    Func maskOut("maskOut");
    Expr maskOutHelp = 0;
    for (int i = -boundingBox; i <= boundingBox; i++) {
        for (int j = -boundingBox; j <= boundingBox; j++) {
            maskOutHelp = select((polynomial1(x, y)*kernel1(i, j) + polynomial2(x, y)*kernel2(i, j)
                                + polynomial3(x, y)*kernel3(i, j) + polynomial4(x, y)*kernel4(i, j)
                                + polynomial5(x, y)*kernel5(i, j)) == 0.0f,
                                 maskOutHelp,
                                 maskOutHelp | mask_bounded(x + i, y + j));
            // maskOutHelp = maskOutHelp | mask_bounded(x + i, y + j);
        }
    }
    maskOut(x, y) = maskOutHelp;

    // Schedule
    // blur.reorder(i_v, x, y);
    // kernel1.compute_at(blurImage, x);
    // kernel1.vectorize(x, 8);
    // kernel1.split(y, y0, yi, 4);
    // kernel1.parallel(y0);
    /*
    kernel1.compute_root();
    kernel2.compute_root();
    kernel3.compute_root();
    kernel4.compute_root();
    kernel5.compute_root();
    */

    // Best schedule found:
#ifdef TESTING_GPU
    blurImage.gpu_tile(x, y, 16, 16);

    // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
    // not enabled by default. We have to construct a Target
    // object, enable one of them, and then pass that target
    // object to compile_jit. Otherwise your CPU will very slowly
    // pretend it's a GPU, and use one thread per output pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or CUDA.
    // We'll enable OpenCL here, because it tends to give better
    // performance than CUDA, even with NVidia's drivers, because
    // NVidia's open source LLVM backend doesn't seem to do all
    // the same optimizations their proprietary compiler does.
    target.set_feature(Target::OpenCL);

    // Uncomment the next line and comment out the line above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL or CUDA API calls done
    // by the pipeline, you can also enable the Debug flag. This is
    // helpful for figuring out which stages are slow, or when
    // CPU -> GPU copies happen. It hurts performance though, so
    // we'll leave it commented out.
    // target.set_feature(Target::Debug);

    blurImage.compile_jit(target);
#else
    blurImage.split(y, y0, yi, 4);
    blurImage.parallel(y0);
    blurImage.vectorize(x, 8);
#endif

    // Split the y coordinate of the consumer into strips:
    blurVariance.split(y, y0, yi, 4);
    // Compute the strips using a thread pool and a task queue.
    blurVariance.parallel(y0);
    // Vectorize across x.
    blurVariance.vectorize(x, 8);

    // polynomial1.compute_at(blurImage, x).vectorize(x, 8);
    // kernel1.compute_at(blurImage, x).vectorize(x, 8);

    // Split the y coordinate of the consumer into strips of 30 scanlines:
    maskOut.split(y, y0, yi, 30);
    // Compute the strips using a thread pool and a task queue.
    maskOut.parallel(y0);
    // Vectorize across x by a factor of eight.
    maskOut.vectorize(x, 8);

    // kernel1.trace_stores();
    // blurImage.trace_stores();

    // Check out what is happening
    blurImage.print_loop_nest();
    // Print out pseudocode for the pipeline.
    blurImage.compile_to_lowered_stmt("linearCombinationKernelBlurImage.html", {image}, HTML);
    // blurImage.compile_to_c("linearCombinationKernel_C_Code.cpp",
    //                        std::vector<Argument>(), "linearCombinationKernel_C_Code");
    // blurVariance.compile_to_lowered_stmt("blur.html", {variance}, HTML);

    // Benchmark the pipeline.
#ifdef TESTING_GPU
    Buffer image_output(Float(32), image.width(), image.height());  // for GPU testing
#else
    Image<float> image_output(image.width(), image.height());
#endif
    blurImage.realize(image_output);

    Image<float> variance_output(variance.width(), variance.height());
    blurVariance.realize(variance_output);

    Image<int32_t> mask_output(mask.width(), mask.height());
    maskOut.realize(mask_output);

#ifdef TESTING_GPU
    // Run the filter once to initialize any GPU runtime state.
    blurImage.realize(image_output);

    // Now take the best of 3 runs for timing.
    double best_time;
    for (int i = 0; i < 3; i++) {
        double t1 = current_time();

        // Run the filter 100 times.
        for (int j = 0; j < 100; j++) {
            blurImage.realize(image_output);
        }

        // Force any GPU code to finish by copying the buffer back to the CPU.
        image_output.copy_to_host();

        double t2 = current_time();
        double elapsed = (t2 - t1)/100;
        if (i == 0 || elapsed < best_time) {
            best_time = elapsed;
        }
    }
    printf("%1.4f milliseconds\n", best_time);
#else
    double average = 0;
    double min;
    double max;
    double imgTime;
    double varTime;
    double maskTime;
    int numberOfRuns = 5;
    for (int i = 0; i < numberOfRuns; i++) {
        double t1 = current_time();
        blurImage.realize(image_output);
        double t2 = current_time();
        blurVariance.realize(variance_output);
        double t3 = current_time();
        maskOut.realize(mask_output);
        double t4 = current_time();

        double curTime = (t4 - t1);
        average += curTime;
        if (i == 0) {
            min = curTime;
            max = curTime;
            imgTime = t2 - t1;
            varTime = t3 - t2;
            maskTime = t4 - t3;
        } else {
            if (curTime < min) {
                min = curTime;
                imgTime = t2 - t1;
                varTime = t3 - t2;
                maskTime = t4 - t3;
            }
            if (curTime > max) {
                max = curTime;
            }
        }
    }
    average = average/numberOfRuns;
    std::cout << "Average Time: " << average << ", Min = " << min << ", Max = " << max
              << ", with " << numberOfRuns << " runs" << '\n';
    cout << "For fastest run total time = " << min << ", imgTime = " << imgTime
         << ", varTime = " << varTime << ", maskTime = " << maskTime << endl;
#endif

#if !defined(STANDALONE) && !defined(TESTING_GPU)
    // Write image out
    auto imOut = afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                                       lsst::afw::image::VariancePixel>(im.getWidth(), im.getHeight());
    for (int y = 0; y < imOut.getHeight(); y++) {
        afwImage::MaskedImage<float, lsst::afw::image::MaskPixel,
                              lsst::afw::image::VariancePixel>::x_iterator inPtr = imOut.x_at(0, y);
        for (int x = 0; x < imOut.getWidth(); x++) {
            afwImage::pixel::SinglePixel<float, lsst::afw::image::MaskPixel,
                                         lsst::afw::image::VariancePixel>
                curPixel(image_output(x, y), mask_output(x, y), variance_output(x, y));
            (*inPtr) = curPixel;
            inPtr++;
        }
    }
    imOut.writeFits("./halideLinearCombination5x5.fits");
#endif
    return 0;
}
// Now a schedule that uses CUDA or OpenCL.
void schedule_for_gpu() {
    // We make the decision about whether to use the GPU for each
    // Func independently. If you have one Func computed on the
    // CPU, and the next computed on the GPU, Halide will do the
    // copy-to-gpu under the hood. For this pipeline, there's no
    // reason to use the CPU for any of the stages. Halide will
    // copy the input image to the GPU the first time we run the
    // pipeline, and leave it there to reuse on subsequent runs.

    // As before, we'll compute the LUT once at the start of the
    // pipeline.
    lut.compute_root();

    // Let's compute the look-up-table using the GPU in 16-wide
    // one-dimensional thread blocks. First we split the index
    // into blocks of size 16:
    Var block, thread;
    lut.split(i, block, thread, 16);
    // Then we tell cuda that our Vars 'block' and 'thread'
    // correspond to CUDA's notions of blocks and threads, or
    // OpenCL's notions of thread groups and threads.
    lut.gpu_blocks(block)
       .gpu_threads(thread);

    // This is a very common scheduling pattern on the GPU, so
    // there's a shorthand for it:

    // lut.gpu_tile(i, block, thread, 16);

    // Func::gpu_tile behaves the same as Func::tile, except that
    // it also specifies that the tile coordinates correspond to
    // GPU blocks, and the coordinates within each tile correspond
    // to GPU threads.

    // Compute color channels innermost. Promise that there will
    // be three of them and unroll across them.
    curved.reorder(c, x, y)
          .bound(c, 0, 3)
          .unroll(c);

    // Compute curved in 2D 8x8 tiles using the GPU.
    curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

    // This is equivalent to:
    // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
    //       .gpu_blocks(xo, yo)
    //       .gpu_threads(xi, yi);

    // We'll leave sharpen as inlined into curved.

    // Compute the padded input as needed per GPU block, storing
    // the intermediate result in shared memory. In the schedule
    // above xo corresponds to GPU blocks.
    padded.compute_at(curved, xo);

    // Use the GPU threads for the x and y coordinates of the
    // padded input.
    padded.gpu_threads(x, y);

    // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
    // Metal are not enabled by default. We have to construct a
    // Target object, enable one of them, and then pass that
    // target object to compile_jit. Otherwise your CPU will very
    // slowly pretend it's a GPU, and use one thread per output
    // pixel.

    // Start with a target suitable for the machine you're running
    // this on.
    Target target = get_host_target();

    // Then enable OpenCL or Metal, depending on which platform
    // we're on. OS X doesn't update its OpenCL drivers, so they
    // tend to be broken. CUDA would also be a fine choice on
    // machines with NVidia GPUs.
    if (target.os == Target::OSX) {
        target.set_feature(Target::Metal);
    } else {
        target.set_feature(Target::OpenCL);
    }

    // Uncomment the next line and comment out the lines above to
    // try CUDA instead.
    // target.set_feature(Target::CUDA);

    // If you want to see all of the OpenCL, Metal, or CUDA API
    // calls done by the pipeline, you can also enable the Debug
    // flag. This is helpful for figuring out which stages are
    // slow, or when CPU -> GPU copies happen. It hurts
    // performance though, so we'll leave it commented out.
    // target.set_feature(Target::Debug);

    curved.compile_jit(target);
}
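// A minimal timing sketch for the schedule above (the buffer name and the
// current_time() helper are assumptions borrowed from the benchmark
// elsewhere in this section). Because GPU kernels run asynchronously, the
// loop forces completion with copy_to_host() before stopping the clock.
Buffer<uint8_t> output(input.width(), input.height(), input.channels());
curved.realize(output);  // warm-up run: pays one-time GPU setup costs
double t1 = current_time();
for (int i = 0; i < 100; i++) {
    curved.realize(output);
}
output.copy_to_host();  // wait for the queued GPU work to finish
double t2 = current_time();
printf("%1.4f milliseconds per run\n", (t2 - t1)/100);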
int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();
    if (!t.features_any_of({Target::CUDACapability50, Target::CUDACapability61})) {
        printf("This test requires cuda enabled with cuda capability 5.0 or greater\n");
        return 0;
    }

    {
        // Shuffle test to do a small convolution
        Func f, g;
        Var x, y;

        f(x, y) = x + y;
        g(x, y) = f(x-1, y) + f(x+1, y);

        Var xo, xi, yi, yo;
        g.gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp).gpu_lanes(xi);
        f.compute_root();
        f.in(g).compute_at(g, yi)
            .split(x, xo, xi, 32, TailStrategy::RoundUp)
            .gpu_lanes(xi)
            .unroll(xo);

        Buffer<int> out = g.realize(32, 4);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 2*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Broadcast test - an outer product access pattern
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .gpu_lanes(xi);

        // We're going to be computing 'a' and 'b' at block level, but
        // we want them in register, not shared, so we explicitly call
        // store_in.
        a.in(c).compute_at(c, x)
            .gpu_lanes(x)
            .store_in(MemoryType::Register);
        b.in(c).compute_at(c, x)
            .gpu_lanes(y)
            .store_in(MemoryType::Register);

        Buffer<float> out = c.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Vectorized broadcast test. Each lane is responsible for a
        // 2-vector from 'a' and a 2-vector from 'b' instead of a single
        // value.
        Func a, b, c;
        Var x, y;

        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();

        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 64, 64, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .split(yi, yi, yii, 64).unroll(yii, 2).gpu_threads(yi)
            .vectorize(xi, 2).gpu_lanes(xi);

        a.in(c).compute_at(c, yi).vectorize(x, 2).gpu_lanes(x);
        b.in(c).compute_at(c, yi).vectorize(y, 2).gpu_lanes(y);

        Buffer<float> out = c.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A stencil chain where many of the lanes will be masked
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x-1, y) + a(x, y) + a(x+1, y);
        c(x, y) = b(x-1, y) + b(x, y) + b(x+1, y);
        d(x, y) = c(x-1, y) + c(x, y) + c(x+1, y);

        Var xi, yi;
        // Compute 24-wide pieces of output per block. Should use 32
        // warp lanes to do so. The footprint on the input is 30, so
        // the last two lanes are always inactive. 26-wide blocks
        // would be a more efficient use of the gpu, but a less
        // interesting test.
        d.gpu_tile(x, y, xi, yi, 24, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Same as above, but in half-warps
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x-1, y) + a(x, y) + a(x+1, y);
        c(x, y) = b(x-1, y) + b(x, y) + b(x+1, y);
        d(x, y) = c(x-1, y) + c(x, y) + c(x+1, y);

        Var xi, yi;
        // Compute 10-wide pieces of output per block. Should use 16
        // warp lanes to do so.
        d.gpu_tile(x, y, xi, yi, 10, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27*(x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A shuffle with a shift amount that depends on the y coord
        Func a, b;
        Var x, y;
        a(x, y) = x + y;
        b(x, y) = a(x + y, y);

        Var xi, yi;
        b.gpu_tile(x, y, xi, yi, 16, 8, TailStrategy::RoundUp).gpu_lanes(xi);
        a.compute_at(b, yi).gpu_lanes(x);

        Buffer<int> out = b.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = x + 2*y;
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Bilinear upsample
        Func f, upx, upy;
        Var x, y;

        f(x, y) = cast<float>(x + y);
        f.compute_root();

        upx(x, y) = 0.25f * f((x/2) - 1 + 2*(x % 2), y) + 0.75f * f(x/2, y);
        upy(x, y) = 0.25f * upx(x, (y/2) - 1 + 2*(y % 2)) + 0.75f * upx(x, y/2);

        // Compute 128x64 tiles of output, which require 66x34 tiles
        // of input. All intermediate data stored in lanes and
        // accessed using register shuffles.
        Var xi, yi, xii, yii;
        upy.tile(x, y, xi, yi, 128, 64, TailStrategy::RoundUp)
            .tile(xi, yi, xii, yii, 4, 8).vectorize(xii)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);

        upx.compute_at(upy, yi).unroll(x, 4).gpu_lanes(x).unroll(y);

        // Stage the input into lanes, doing two dense vector loads
        // per lane, and use register shuffles to do the upsample in x.
        f.in().compute_at(upy, yi).align_storage(x, 64)
            .vectorize(x, 2, TailStrategy::RoundUp)
            .split(x, x, xi, 32, TailStrategy::GuardWithIf)
            .reorder(xi, y, x).gpu_lanes(xi).unroll(x).unroll(y);

        upy.output_buffer().dim(0).set_min(0).dim(1).set_min(0);

        Buffer<float> out = upy.realize(128, 128);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x + y - 1) / 2.0f;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Box-downsample by a factor of 8 using summation within each
        // warp.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2*x, y) + f(2*x + 1, y);
        s2(x, y) = s1(2*x, y) + s1(2*x + 1, y);
        s3(x, y) = s2(2*x, y) + s2(2*x + 1, y);
        s4(x, y) = s3(x, y);

        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 64, 1, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 64, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                // One factor of 8 from adding instead of averaging,
                // and another factor of 8 from the compression of the
                // coordinate system across x.
                float correct = (x*8 + y)*8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // The same, with a narrower tile in x so that one warp is
        // divided up across many scanlines.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2*x, y) + f(2*x + 1, y);
        s2(x, y) = s1(2*x, y) + s1(2*x + 1, y);
        s3(x, y) = s2(2*x, y) + s2(2*x + 1, y);
        s4(x, y) = s3(x, y);

        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 8, 16, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 8, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x*8 + y)*8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        Buffer<uint8_t> buf(256, 256);
        buf.for_each_value([](uint8_t &x) { x = rand(); });
        buf.set_host_dirty();

        // Store a small LUT in-register, populated at the warp
        // level.
        Func lut;
        Var x, y;
        lut(x) = cast<uint16_t>(x) + 1;

        Func curved;
        curved(x, y) = lut(buf(x, y));

        Var xi, yi, xo;
        curved.compute_root().tile(x, y, xi, yi, 32, 32)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);

        lut.compute_root();
        // Load the LUT into shared at the start of each block using warp 0.
        lut.in().compute_at(curved, x).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);
        // Load it from shared into registers for each warp.
        lut.in().in().compute_at(curved, yi).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);

        Buffer<uint16_t> out = curved.realize(buf.width(), buf.height());
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                uint16_t actual = out(x, y);
                uint16_t correct = ((uint16_t)buf(x, y)) + 1;
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n", x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Test a case that caused combinatorial explosion
        Var x;
        Expr e = x;
        for (int i = 0; i < 10; i++) {
            e = fast_pow(e, e + 1);
        }

        Func f;
        f(x) = e;

        Var xo, xi;
        f.gpu_tile(x, xo, xi, 32);

        f.realize(1024);
    }

    printf("Success!\n");
    return 0;
}
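// Distilled sketch of the pattern the tests above exercise (an assumed
// simplification, not one of the test cases): gpu_lanes() marks a dimension
// of extent at most 32 as the warp-lane dimension, so a producer staged at
// that level lives in registers and is read via warp shuffles rather than
// shared memory.
Func producer, consumer;
Var x, y, xi, yi;
producer(x, y) = x + y;
consumer(x, y) = producer(x - 1, y) + producer(x + 1, y);
// 24-wide tiles keep the producer footprint (26 values) within one 32-lane warp.
consumer.gpu_tile(x, y, xi, yi, 24, 2).gpu_lanes(xi);
producer.compute_at(consumer, yi).gpu_lanes(x);
Buffer<int> out = consumer.realize(24, 2);  // expect out(x, y) == 2*(x + y)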
template<typename A, typename B>
bool test(int vec_width, const Target &target) {
    if (!is_type_supported<A>(vec_width, target) ||
        !is_type_supported<B>(vec_width, target)) {
        // Type not supported, return pass.
        return true;
    }

    int W = 1024;
    int H = 1;

    Buffer<A> input(W, H);
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            input(x, y) = (A)((rand() & 0xffff)*0.1);
        }
    }

    Var x, y;
    Func f;
    f(x, y) = cast<B>(input(x, y));

    if (target.has_gpu_feature()) {
        Var xo, xi;
        f.gpu_tile(x, xo, xi, 64);
    } else {
        if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            // TODO: Non-native vector widths hang the compiler here.
            //f.hexagon();
        }
        if (vec_width > 1) {
            f.vectorize(x, vec_width);
        }
    }

    Buffer<B> output = f.realize(W, H);

    /*
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            printf("%d %d -> %d %d\n", x, y, (int)(input(x, y)), (int)(output(x, y)));
        }
    }
    */

    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            bool ok = ((B)(input(x, y)) == output(x, y));
            if (!ok) {
                fprintf(stderr, "%s x %d -> %s x %d failed\n",
                        string_of_type<A>(), vec_width,
                        string_of_type<B>(), vec_width);
                fprintf(stderr, "At %d %d, %f -> %f instead of %f\n",
                        x, y,
                        (double)(input(x, y)),
                        (double)(output(x, y)),
                        (double)((B)(input(x, y))));
                return false;
            }
        }
    }

    return true;
}
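// A plausible sketch of the helper assumed by the harness above (not its
// actual definition): a cast test only makes sense if the target can
// represent the type at that vector width, e.g. 64-bit integers and float64
// are unavailable on several GPU backends. Target::supports_type and
// type_of<T>() are existing Halide APIs.
template<typename T>
bool is_type_supported(int vec_width, const Target &target) {
    return target.supports_type(type_of<T>().with_lanes(vec_width));
}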
int main(int argc, char **argv) {
    const int W = 256, H = 256;
    Buffer<uint8_t> in(W, H);
    // Set up the input.
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            in(x, y) = rand() & 0xff;
        }
    }

    // Define a convolution kernel, and its sum.
    Buffer<int8_t> kernel(3, 3);
    kernel.set_min(-1, -1);
    for (int y = -1; y <= 1; y++) {
        for (int x = -1; x <= 1; x++) {
            kernel(x, y) = rand() % 8 - 4;
        }
    }

    Var x("x"), y("y"), xi("xi"), yi("yi");
    RDom r(-1, 3, -1, 3);

    // Boundary condition.
    Func input = BoundaryConditions::repeat_edge(in);
    input.compute_root();

    // Test a widening reduction, followed by a narrowing.
    {
        Func f;
        f(x, y) = u8_sat(sum(i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y)) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            f.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            f.hexagon().vectorize(x, 128);
        } else {
            f.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = f.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y)
                              << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    // Test a tuple reduction with widening, followed by narrowing the result.
    {
        Func f;
        f(x, y) = {i16(0), i8(0)};
        f(x, y) = {
            f(x, y)[0] + i16(input(x + r.x, y + r.y)) * kernel(r.x, r.y),
            f(x, y)[1] + kernel(r.x, r.y),
        };

        Func g;
        g(x, y) = u8_sat((f(x, y)[0] + f(x, y)[1]) / 16);

        // Schedule.
        Target target = get_jit_target_from_environment();
        if (target.has_gpu_feature()) {
            g.gpu_tile(x, y, xi, yi, 16, 16);
        } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
            g.hexagon().vectorize(x, 128);
        } else {
            g.vectorize(x, target.natural_vector_size<uint8_t>());
        }

        // Run the pipeline and verify the results are correct.
        Buffer<uint8_t> out = g.realize(W, H, target);
        for (int y = 1; y < H-1; y++) {
            for (int x = 1; x < W-1; x++) {
                int16_t correct = 0;
                for (int ry = -1; ry <= 1; ry++) {
                    for (int rx = -1; rx <= 1; rx++) {
                        correct += static_cast<int16_t>(in(x + rx, y + ry)) * kernel(rx, ry);
                        correct += kernel(rx, ry);
                    }
                }
                correct = std::min(std::max(correct / 16, 0), 255);
                if (correct != out(x, y)) {
                    std::cout << "out(" << x << ", " << y << ") = " << (int)out(x, y)
                              << " instead of " << correct << "\n";
                    return -1;
                }
            }
        }
    }

    std::cout << "Success!\n";
    return 0;
}