Example #1
void transpose(Param<T> out, CParam<T> in, const bool conjugate,
               const bool is32multiple) {
    static const std::string source(transpose_cuh, transpose_cuh_len);

    // clang-format off
    auto transpose = getKernel("cuda::transpose", source,
            {
              TemplateTypename<T>(),
              TemplateArg(conjugate),
              TemplateArg(is32multiple)
            },
            {
              DefineValue(TILE_DIM),
              DefineValue(THREADS_Y)
            }
            );
    // clang-format on

    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);
    const int maxBlocksY =
        cuda::getDeviceProp(getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    EnqueueArgs qArgs(blocks, threads, getActiveStream());

    transpose(qArgs, out, in, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
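A note on divup, which these launchers use throughout to size their grids: it is integer ceiling division. A minimal equivalent, written here as a standalone helper (the name divup_sketch is ours, not ArrayFire's), is:

static inline unsigned divup_sketch(unsigned a, unsigned b) {
    // ceiling division: the smallest n such that n * b >= a
    return (a + b - 1) / b;
}

For instance, divup(in.dims[0], TILE_DIM) above yields the number of TILE_DIM-wide tiles needed to cover dimension 0.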
Example #2
    static void scan_first_launcher(Param<To> out,
                             Param<To> tmp,
                             CParam<Ti> in,
                             const uint blocks_x,
                             const uint blocks_y,
                             const uint threads_x)
    {

        dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
        dim3 blocks(blocks_x * out.dims[2],
                    blocks_y * out.dims[3]);

        uint lim = divup(out.dims[0], (threads_x * blocks_x));

        switch (threads_x) {
        case 32:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  32>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 64:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  64>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 128:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  128>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 256:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  256>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        }

        POST_LAUNCH_CHECK();
    }
Example #3
void meanshift(Param<T> out, CParam<T> in, float s_sigma, float c_sigma, uint iter)
{
    static dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);

    const int bCount   = (is_color ? 1 : in.dims[2]);
    const int channels = (is_color ? in.dims[2] : 1); // this has to be 3 for color images

    dim3 blocks(blk_x * bCount, blk_y * in.dims[3]);

    // clamp the spatial and chromatic sigmas
    float space_     = std::min(11.5f, s_sigma);
    int radius  = std::max((int)(space_ * 1.5f), 1);
    int padding = 2*radius+1;
    const float cvar = c_sigma*c_sigma;
    size_t shrd_size = channels*(threads.x + padding)*(threads.y+padding)*sizeof(T);

    if (is_color)
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 3>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);
    else
        CUDA_LAUNCH_SMEM((meanshiftKernel<T, 1>), blocks, threads, shrd_size,
                         out, in, space_, radius, cvar, iter, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
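CUDA_LAUNCH_SMEM forwards shrd_size as the dynamic shared-memory size of the launch (the third launch-configuration parameter). As an illustration of the kernel-side counterpart, not ArrayFire's actual meanshiftKernel, a kernel consumes that allocation through an extern declaration:

template<typename T>
__global__ void smem_consumer_sketch(/* ... kernel arguments ... */) {
    // backed by the byte count passed at launch time (shrd_size above)
    extern __shared__ char smem_raw[];
    T* shrd = reinterpret_cast<T*>(smem_raw);
    // ... cooperative loads into shrd[] would follow ...
}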
Example #4
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            int nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const int ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice,
                                          cuda::getStream(cuda::getActiveDeviceId())));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            const int blocksXPerImage = blocks.x;
            if(nimages > TI) {
                int tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            } else {
                CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            }
            POST_LAUNCH_CHECK();
        }
Example #5
    static void scan_dim_nonfinal_launcher(Param<To> out,
                                           Param<To> tmp,
                                           Param<char> tflg,
                                           Param<int> tlid,
                                           CParam<Ti> in,
                                           CParam<Tk> key,
                                           const int dim,
                                           const uint threads_y,
                                           const uint blocks_all[4],
                                           bool inclusive_scan)
    {
        dim3 threads(THREADS_X, threads_y);

        dim3 blocks(blocks_all[0] * blocks_all[2],
                    blocks_all[1] * blocks_all[3]);

        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 8>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 4:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 4>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 2:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 2>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 1:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 1>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        }

        POST_LAUNCH_CHECK();
    }
Example #6
        void shift(Param<T> out, CParam<T> in, const int *sdims)
        {
            dim3 threads(TX, TY, 1);

            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
            blocks.z = divup(blocks.y, maxBlocksY);
            blocks.y = divup(blocks.y, blocks.z);

            int sdims_[4];
            // Need to do this because we are mapping output to input in the kernel
            for(int i = 0; i < 4; i++) {
                // sdims_[i] will always be non-negative and within [0, out.dims[i]].
                // A negative shift is converted to the equivalent positive shift by wrapping around.
                sdims_[i] = -(sdims[i] % (int)out.dims[i]) + out.dims[i] * (sdims[i] > 0);
                assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
            }

            CUDA_LAUNCH((shift_kernel<T>), blocks, threads,
                    out, in, sdims_[0], sdims_[1], sdims_[2], sdims_[3],
                    blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
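Examples #1, #6, and #27 all fold an oversized grid-y dimension into grid-z (blocks.z = divup(blocks.y, maxBlocksY); blocks.y = divup(blocks.y, blocks.z)) to stay under the device's maxGridSize[1]. On the kernel side the logical block-y index is recovered from the two coordinates; a sketch of that arithmetic (illustrative kernel, not the actual shift_kernel):

__global__ void split_grid_sketch(int logicalBlocksY) {
    // reconstruct the block-y index the launcher originally computed
    int blockIdx_y = blockIdx.z * gridDim.y + blockIdx.y;
    if (blockIdx_y >= logicalBlocksY) return;  // skip padding introduced by the divup rounding
    // ... indexing continues using blockIdx_y in place of blockIdx.y ...
}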
Example #7
        void sort0_by_key(Param<Tk> okey, Param<Tv> oval)
        {
            thrust::device_ptr<Tk>       okey_ptr = thrust::device_pointer_cast(okey.ptr);
            thrust::device_ptr<Tv>       oval_ptr = thrust::device_pointer_cast(oval.ptr);

            for(int w = 0; w < okey.dims[3]; w++) {
                int okeyW = w * okey.strides[3];
                int ovalW = w * oval.strides[3];
                for(int z = 0; z < okey.dims[2]; z++) {
                    int okeyWZ = okeyW + z * okey.strides[2];
                    int ovalWZ = ovalW + z * oval.strides[2];
                    for(int y = 0; y < okey.dims[1]; y++) {

                        int okeyOffset = okeyWZ + y * okey.strides[1];
                        int ovalOffset = ovalWZ + y * oval.strides[1];

                        if(isAscending) {
                            thrust::sort_by_key(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0],
                                                oval_ptr + ovalOffset);
                        } else {
                            thrust::sort_by_key(okey_ptr + okeyOffset, okey_ptr + okeyOffset + okey.dims[0],
                                                oval_ptr + ovalOffset, thrust::greater<Tk>());
                        }
                    }
                }
            }
            POST_LAUNCH_CHECK();
        }
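The nested loops above simply apply thrust::sort_by_key independently to each dims[0]-long segment of the key/value arrays, ascending or descending. For readers unfamiliar with the Thrust call, a self-contained sketch of the same pattern on plain device_vectors (element types chosen arbitrarily here) is:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sort.h>

void sort_pairs_sketch(thrust::device_vector<int>   &keys,
                       thrust::device_vector<float> &vals,
                       bool ascending) {
    if (ascending)
        thrust::sort_by_key(keys.begin(), keys.end(), vals.begin());
    else  // descending: supply a greater-than comparator for the keys
        thrust::sort_by_key(keys.begin(), keys.end(), vals.begin(),
                            thrust::greater<int>());
}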
Example #8
    static void scan_dim_launcher(Param<To> out,
                           Param<To> tmp,
                           CParam<Ti> in,
                           const uint threads_y,
                           const uint blocks_all[4])
    {
        dim3 threads(THREADS_X, threads_y);

        dim3 blocks(blocks_all[0] * blocks_all[2],
                    blocks_all[1] * blocks_all[3]);

        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 4:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 2:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 1:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        }

        POST_LAUNCH_CHECK();
    }
Example #9
void morph(Param<T> out, CParam<T> in, int windLen)
{
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], THREADS_X);
    int blk_y = divup(in.dims[1], THREADS_Y);
    // launch batch * blk_x blocks along x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // calculate shared memory size
    int halo      = windLen/2;
    int padding   = 2*halo;
    int shrdLen   = kernel::THREADS_X + padding + 1; // +1 to avoid bank conflicts
    int shrdSize  = shrdLen * (kernel::THREADS_Y + padding) * sizeof(T);

    switch(windLen) {
        case  3: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  5: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 5>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  7: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 7>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case  9: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 9>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 11: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,11>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 13: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,13>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 15: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,15>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 17: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,17>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        case 19: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation,19>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
        default: CUDA_LAUNCH_SMEM((morphKernel<T, isDilation, 3>), blocks, threads, shrdSize, out, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
Example #10
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
{
    if (nDims==1) {
        const dim3 threads(THREADS, 1);
        /* find the first dimension with more than one element */
        int vDim = 0;
        for (int i=0; i<4; i++) {
            if (in.dims[i]==1)
                vDim++;
            else
                break;
        }

        int blks = divup(out.dims[vDim], THREADS*THRD_LOAD);

        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in, indices, blks_x, blks_y);
    }

    POST_LAUNCH_CHECK();
}
Example #11
        void sort0_index(Param<T> val, Param<unsigned> idx)
        {
            thrust::device_ptr<T>        val_ptr = thrust::device_pointer_cast(val.ptr);
            thrust::device_ptr<unsigned> idx_ptr = thrust::device_pointer_cast(idx.ptr);

            for(int w = 0; w < val.dims[3]; w++) {
                int valW = w * val.strides[3];
                int idxW = w * idx.strides[3];
                for(int z = 0; z < val.dims[2]; z++) {
                    int valWZ = valW + z * val.strides[2];
                    int idxWZ = idxW + z * idx.strides[2];
                    for(int y = 0; y < val.dims[1]; y++) {

                        int valOffset = valWZ + y * val.strides[1];
                        int idxOffset = idxWZ + y * idx.strides[1];

                        THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]);
                        if(isAscending) {
                            THRUST_SELECT(thrust::sort_by_key,
                                    val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                    idx_ptr + idxOffset);
                        } else {
                            THRUST_SELECT(thrust::sort_by_key,
                                        val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                        idx_ptr + idxOffset, thrust::greater<T>());
                        }
                    }
                }
            }
            POST_LAUNCH_CHECK();
        }
Example #12
    static void identity(Param<T> out)
    {
        dim3 threads(32, 8);
        int blocks_x = divup(out.dims[0], threads.x);
        int blocks_y = divup(out.dims[1], threads.y);
        dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

        CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
        POST_LAUNCH_CHECK();
    }
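As in most launchers here, the grid is batched: blocks_x * dims[2] blocks along x and blocks_y * dims[3] along y, with blocks_x/blocks_y passed in so the kernel can undo the packing. A sketch of that decoding (illustrative only, not the actual identity_kernel):

__global__ void batched_grid_sketch(int blocks_x, int blocks_y) {
    int batch2 = blockIdx.x / blocks_x;              // index along dims[2]
    int batch3 = blockIdx.y / blocks_y;              // index along dims[3]
    int xx = (blockIdx.x - batch2 * blocks_x) * blockDim.x + threadIdx.x;
    int yy = (blockIdx.y - batch3 * blocks_y) * blockDim.y + threadIdx.y;
    // ... bounds checks against dims[0]/dims[1] and the actual work follow ...
}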
Example #13
        void coo2dense(Param<T> output, CParam<T> values, CParam<int> rowIdx, CParam<int> colIdx)
        {
            dim3 threads(256, 1, 1);

            dim3 blocks(divup(output.dims[0], threads.x * reps), 1, 1);

            CUDA_LAUNCH((coo2dense_kernel<T>), blocks, threads, output, values, rowIdx, colIdx);

            POST_LAUNCH_CHECK();
        }
Example #14
        void approx1(Param<Ty> out, CParam<Ty> in,
               CParam<Tp> pos, const float offGrid)
        {
            dim3 threads(THREADS, 1, 1);
            int blocksPerMat = divup(out.dims[0], threads.x);
            dim3 blocks(blocksPerMat * out.dims[1], out.dims[2] * out.dims[3]);

            approx1_kernel<Ty, Tp, method><<<blocks, threads>>>
                          (out, in, pos, offGrid, blocksPerMat);
            POST_LAUNCH_CHECK();
        }
Example #15
        void approx2(Param<Ty> out, CParam<Ty> in,
                    CParam<Tp> pos, CParam<Tp> qos, const float offGrid)
        {
            dim3 threads(TX, TY, 1);
            int blocksPerMatX = divup(out.dims[0], threads.x);
            int blocksPerMatY = divup(out.dims[1], threads.y);
            dim3 blocks(blocksPerMatX * out.dims[2], blocksPerMatY * out.dims[3]);

            approx2_kernel<Ty, Tp, method><<<blocks, threads>>>
                          (out, in, pos, qos, offGrid, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #16
void matchTemplate(Param<outType> out, CParam<inType> srch, CParam<inType> tmplt)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(srch.dims[0], threads.x);
    int blk_y = divup(srch.dims[1], threads.y);

    dim3 blocks(blk_x*srch.dims[2], blk_y*srch.dims[3]);

    matchTemplate<inType, outType, mType, needMean> <<< blocks, threads >>> (out, srch, tmplt, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
Example #17
        void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in)
        {
            dim3 threads(TX, TY, 1);

            int blocksPerMatX = divup(in.dims[0], TX);
            int blocksPerMatY = divup(in.dims[1], TY);
            dim3 blocks(blocksPerMatX * in.dims[2],
                        blocksPerMatY * in.dims[3],
                        1);

            gradient_kernel<T><<<blocks, threads>>>(grad0, grad1, in, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #18
File: random.hpp  Project: hxiaox/arrayfire
    void randu(T *out, size_t elements)
    {
        int device = getActiveDeviceId();

        int threads = THREADS;
        int blocks  = divup(elements, THREADS);
        if (blocks > BLOCKS) blocks = BLOCKS;

        curandState_t *state = getcurandState();

        CUDA_LAUNCH(uniform_kernel, blocks, threads, out, state, elements);
        POST_LAUNCH_CHECK();
    }
Example #19
        void join(Param<Tx> out, CParam<Tx> X, CParam<Ty> Y)
        {
            dim3 threads(TX, TY, 1);

            dim_type blocksPerMatX = divup(out.dims[0], TILEX);
            dim_type blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            join_kernel<Tx, Ty, dim><<<blocks, threads>>>(out, X, Y, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #20
File: range.hpp  Project: DeepCV/arrayfire
        void range(Param<T> out, const int dim)
        {
            dim3 threads(RANGE_TX, RANGE_TY, 1);

            int blocksPerMatX = divup(out.dims[0], RANGE_TILEX);
            int blocksPerMatY = divup(out.dims[1], RANGE_TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            CUDA_LAUNCH((range_kernel<T>), blocks, threads, out, dim, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #21
File: iota.hpp  Project: EasonYi/arrayfire
        void iota(Param<T> out)
        {
            dim3 threads(TX, TY, 1);

            dim_type blocksPerMatX = divup(out.dims[0], TILEX);
            dim_type blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            iota_kernel<T, rep><<<blocks, threads>>>(out, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #22
void assign(Param<T> out, CParam<T> in, const AssignKernelParam_t& p)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blks_x = divup(in.dims[0], threads.x);
    int blks_y = divup(in.dims[1], threads.y);

    dim3 blocks(blks_x*in.dims[2], blks_y*in.dims[3]);

    AssignKernel<T> <<<blocks, threads>>> (out, in, p, blks_x, blks_y);

    POST_LAUNCH_CHECK();
}
Example #23
File: susan.hpp  Project: 9prady9/arrayfire
void susan_responses(T* out, const T* in, const unsigned idim0,
                     const unsigned idim1, const int radius, const float t,
                     const float g, const unsigned edge) {
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X),
                divup(idim1 - edge * 2, BLOCK_Y));
    const size_t SMEM_SIZE =
        (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE, out, in,
                     idim0, idim1, radius, t, g, edge);

    POST_LAUNCH_CHECK();
}
Example #24
        void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim3 threads(THREADS_X, THREADS_Y);
            dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);

            dim_t reps = divup((wx * wy), threads.y);

            CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
Example #25
void exampleFunc(Param<T> out, CParam<T> in, const af_someenum_t p)
{

    dim3 threads(TX, TY, 1);            // set your cuda launch config for blocks

    dim_type blk_x = divup(out.dims[0], threads.x);
    dim_type blk_y = divup(out.dims[1], threads.y);
    dim3 blocks(blk_x, blk_y);          // set your cuda launch config for grid

    // launch your kernel
    exampleFuncKernel<T> <<<blocks, threads>>> (out, in, p);

    POST_LAUNCH_CHECK();                // Macro for post kernel launch checks;
                                        // these checks are carried out ONLY IN DEBUG mode
}
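Every example ends with POST_LAUNCH_CHECK(). Its exact expansion is ArrayFire-internal; as a rough, hypothetical stand-in, a debug-only post-launch check typically looks like this:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical debug-only check; NOT ArrayFire's actual POST_LAUNCH_CHECK macro.
#ifndef NDEBUG
#define POST_LAUNCH_CHECK_SKETCH()                                      \
    do {                                                                \
        cudaError_t err__ = cudaGetLastError();                         \
        if (err__ != cudaSuccess)                                       \
            fprintf(stderr, "kernel launch failed: %s\n",               \
                    cudaGetErrorString(err__));                         \
    } while (0)
#else
#define POST_LAUNCH_CHECK_SKETCH() do {} while (0)
#endif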
Example #26
        void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
        {
            dim3 threads(TX, TY, 1);

            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            CUDA_LAUNCH((iota_kernel<T>), blocks, threads,
                    out, sdims[0], sdims[1], sdims[2], sdims[3],
                    tdims[0], tdims[1], tdims[2], tdims[3], blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #27
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads, grad0, grad1, in,
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
Example #28
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    dim3 blocks(blk_x*in.dims[2], blk_y*in.dims[3]);

    //TODO: add more cases when 5x5 and 7x7 kernels are done
    switch(ker_size) {
        case  3: CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
Example #29
        void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

            dim3 threads(TX, THREADS_PER_BLOCK / TX);
            dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);

            dim_t reps = divup((wx * wy), threads.x); // is > 1 only when TX == 256 && wx * wy > 256

            CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
Example #30
void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
    dim3 threads(kernel::THREADS_X, 1);

    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD*THREADS_X);

    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    // If nbins > MAX_BINS, global memory is used instead, so smem_size can be 0.
    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
            out, in, nElems, nbins, minval, maxval, blk_x);

    POST_LAUNCH_CHECK();
}