Example #1
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            int nimages = in.dims[2];
            // out.dims[2] = in.dims[2] * ntransforms; the multiplication
            // happens in src/backend/transform.cpp
            const int ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr,
                                               ntransforms * 6 * sizeof(float), 0,
                                               cudaMemcpyDeviceToDevice,
                                               cuda::getStream(cuda::getActiveDeviceId())));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            const int blocksXPerImage = blocks.x;
            if(nimages > TI) {
                int tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            } else {
                CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            }
            POST_LAUNCH_CHECK();
        }
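
Every launcher in these examples sizes its grid with divup(). Its definition is not part of the snippets; a minimal sketch, assuming the conventional ceiling-division helper (the real one is presumably also overloaded or templated for dim_t):

static inline int divup(int a, int b)
{
    // Smallest count of size-b blocks that covers a elements:
    // integer ceil(a / b), assuming a >= 0 and b > 0.
    return (a + b - 1) / b;
}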
Example #2
void lookup(Param<in_t> out, CParam<in_t> in, CParam<idx_t> indices, int nDims)
{
    if (nDims == 1) {
        const dim3 threads(THREADS, 1);
        /* find the first dimension with more than one element (the vector
           dimension); leading unit dimensions are skipped */
        int vDim = 0;
        for (int i = 0; i < 4; i++) {
            if (in.dims[i] == 1)
                vDim++;
            else
                break;
        }

        int blks = divup(out.dims[vDim], THREADS*THRD_LOAD);

        dim3 blocks(blks, 1);

        CUDA_LAUNCH((lookup1D<in_t, idx_t>), blocks, threads, out, in, indices, vDim);
    } else {
        const dim3 threads(THREADS_X, THREADS_Y);

        int blks_x = divup(out.dims[0], threads.x);
        int blks_y = divup(out.dims[1], threads.y);

        dim3 blocks(blks_x*out.dims[2], blks_y*out.dims[3]);

        CUDA_LAUNCH((lookupND<in_t, idx_t, dim>), blocks, threads, out, in, indices, blks_x, blks_y);
    }

    POST_LAUNCH_CHECK();
}
Example #3
    static void scan_dim_launcher(Param<To> out,
                           Param<To> tmp,
                           CParam<Ti> in,
                           const uint threads_y,
                           const uint blocks_all[4])
    {
        dim3 threads(THREADS_X, threads_y);

        dim3 blocks(blocks_all[0] * blocks_all[2],
                    blocks_all[1] * blocks_all[3]);

        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 8>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 4:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 4>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 2:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 2>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        case 1:
            CUDA_LAUNCH((scan_dim_kernel<Ti, To, op, dim, isFinalPass, 1>), blocks, threads,
                out, tmp, in, blocks_all[0], blocks_all[1], blocks_all[dim], lim); break;
        }

        POST_LAUNCH_CHECK();
    }
    static void scan_dim_nonfinal_launcher(Param<To> out,
                                           Param<To> tmp,
                                           Param<char> tflg,
                                           Param<int> tlid,
                                           CParam<Ti> in,
                                           CParam<Tk> key,
                                           const int dim,
                                           const uint threads_y,
                                           const uint blocks_all[4],
                                           bool inclusive_scan)
    {
        dim3 threads(THREADS_X, threads_y);

        dim3 blocks(blocks_all[0] * blocks_all[2],
                    blocks_all[1] * blocks_all[3]);

        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        switch (threads_y) {
        case 8:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 8>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 4:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 4>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 2:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 2>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        case 1:
            CUDA_LAUNCH((scan_dim_nonfinal_kernel<Ti, Tk, To, op, 1>), blocks, threads,
                        out, tmp, tflg, tlid, in, key, dim, blocks_all[0], blocks_all[1], lim, inclusive_scan); break;
        }

        POST_LAUNCH_CHECK();
    }
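
In both launchers above, the switch exists only to turn the runtime threads_y into a compile-time template parameter so the kernel can size its shared-memory arrays statically. A hypothetical helper (dispatch_threads_y is illustrative, not ArrayFire API) that collapses the repetition; the caller passes a C++14 generic lambda and reads the constant back as decltype(ty)::value:

#include <type_traits>  // std::integral_constant

template<typename LaunchFn>
static void dispatch_threads_y(unsigned threads_y, LaunchFn launch)
{
    // Map the runtime value onto a compile-time constant.
    switch (threads_y) {
    case 8: launch(std::integral_constant<unsigned, 8>()); break;
    case 4: launch(std::integral_constant<unsigned, 4>()); break;
    case 2: launch(std::integral_constant<unsigned, 2>()); break;
    case 1: launch(std::integral_constant<unsigned, 1>()); break;
    }
}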
Example #5
        void select(Param<T> out, CParam<char> cond, CParam<T> a, CParam<T> b, int ndims)
        {
            bool is_same = true;
            for (int i = 0; i < 4; i++) {
                is_same &= (a.dims[i] == b.dims[i]);
            }

            dim3 threads(DIMX, DIMY);

            if (ndims == 1) {
                threads.x *= threads.y;
                threads.y = 1;
            }

            int blk_x = divup(out.dims[0], threads.x);
            int blk_y = divup(out.dims[1], threads.y);


            dim3 blocks(blk_x * out.dims[2],
                        blk_y * out.dims[3]);

            if (is_same) {
                CUDA_LAUNCH((select_kernel<T, true>), blocks, threads,
                            out, cond, a, b, blk_x, blk_y);
            } else {
                CUDA_LAUNCH((select_kernel<T, false>), blocks, threads,
                            out, cond, a, b, blk_x, blk_y);
            }

            POST_LAUNCH_CHECK();
        }
Example #6
    static void scan_first_launcher(Param<To> out,
                             Param<To> tmp,
                             CParam<Ti> in,
                             const uint blocks_x,
                             const uint blocks_y,
                             const uint threads_x)
    {

        dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
        dim3 blocks(blocks_x * out.dims[2],
                    blocks_y * out.dims[3]);

        uint lim = divup(out.dims[0], (threads_x * blocks_x));

        switch (threads_x) {
        case 32:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  32>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 64:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  64>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 128:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  128>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        case 256:
            CUDA_LAUNCH((scan_first_kernel<Ti, To, op, isFinalPass,  256>), blocks, threads,
                out, tmp, in, blocks_x, blocks_y, lim); break;
        }

        POST_LAUNCH_CHECK();
    }
Example #7
        void shift(Param<T> out, CParam<T> in, const int *sdims)
        {
            dim3 threads(TX, TY, 1);

            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
            blocks.z = divup(blocks.y, maxBlocksY);
            blocks.y = divup(blocks.y, blocks.z);

            int sdims_[4];
            // Need to do this because we are mapping output to input in the kernel
            for(int i = 0; i < 4; i++) {
                // sdims_[i] is always in [0, out.dims[i]]; negative shifts are
                // converted to equivalent positive shifts from the other direction
                sdims_[i] = -(sdims[i] % (int)out.dims[i]) + out.dims[i] * (sdims[i] > 0);
                assert(sdims_[i] >= 0 && sdims_[i] <= out.dims[i]);
            }

            CUDA_LAUNCH((shift_kernel<T>), blocks, threads,
                    out, in, sdims_[0], sdims_[1], sdims_[2], sdims_[3],
                    blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
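
shift() caps grid-Y at the device limit (maxGridSize[1]) by spilling the excess into grid-Z, an idiom several launchers below repeat. The kernel then has to undo the split; a sketch of the recovery, assuming shift_kernel follows the usual convention (the helper name is illustrative):

__device__ inline int logical_blk_y()
{
    // Undo the (y, z) split performed by the launcher. Because the split
    // rounds up, callers must still bounds-check the result against the
    // original logical count (here blocksPerMatY * out.dims[3]).
    return blockIdx.y + blockIdx.z * gridDim.y;
}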
        void coo2dense(Param<T> output, CParam<T> values, CParam<int> rowIdx, CParam<int> colIdx)
        {
            dim3 threads(256, 1, 1);

            dim3 blocks(divup(output.dims[0], threads.x * reps), 1, 1);

            CUDA_LAUNCH((coo2dense_kernel<T>), blocks, threads, output, values, rowIdx, colIdx);

            POST_LAUNCH_CHECK();
        }
Example #9
    static void identity(Param<T> out)
    {
        dim3 threads(32, 8);
        int blocks_x = divup(out.dims[0], threads.x);
        int blocks_y = divup(out.dims[1], threads.y);
        dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

        CUDA_LAUNCH((identity_kernel<T>), blocks, threads, out, blocks_x, blocks_y);
        POST_LAUNCH_CHECK();
    }
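
identity() folds the batch dimensions dims[2] and dims[3] into grid x and y, the pattern most launchers here share. A minimal sketch (not ArrayFire's actual identity_kernel) of how the kernel can unpack those indices:

template<typename T>
__global__ void identity_kernel_sketch(Param<T> out, int blocks_x, int blocks_y)
{
    // Split grid x/y back into (tile, batch) indices.
    const int w = blockIdx.x / blocks_x;  // index along dims[2]
    const int h = blockIdx.y / blocks_y;  // index along dims[3]
    const int x = (blockIdx.x - w * blocks_x) * blockDim.x + threadIdx.x;
    const int y = (blockIdx.y - h * blocks_y) * blockDim.y + threadIdx.y;
    if (x >= out.dims[0] || y >= out.dims[1]) return;

    T *ptr = out.ptr + w * out.strides[2] + h * out.strides[3];
    ptr[x + y * out.strides[1]] = (x == y) ? scalar<T>(1) : scalar<T>(0);
}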
Example #10
    void transpose(Param<T> out, CParam<T> in, const int ndims)
    {
        // dimensions passed to this function should be input dimensions
        // any necessary transformations and dimension related calculations are
        // carried out here and inside the kernel
        dim3 threads(kernel::THREADS_X,kernel::THREADS_Y);


        int blk_x = divup(in.dims[0],TILE_DIM);
        int blk_y = divup(in.dims[1],TILE_DIM);
        // launch batch * blk_x blocks along x dimension
        dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

        if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0)
            CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
        else
            CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);

        POST_LAUNCH_CHECK();
    }
Example #11
        void range(Param<T> out, const int dim)
        {
            dim3 threads(RANGE_TX, RANGE_TY, 1);

            int blocksPerMatX = divup(out.dims[0], RANGE_TILEX);
            int blocksPerMatY = divup(out.dims[1], RANGE_TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            CUDA_LAUNCH((range_kernel<T>), blocks, threads, out, dim, blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #12
void assign(Param<T> out, CParam<T> in, const AssignKernelParam_t& p)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blks_x = divup(in.dims[0], threads.x);
    int blks_y = divup(in.dims[1], threads.y);

    dim3 blocks(blks_x*in.dims[2], blks_y*in.dims[3]);

    CUDA_LAUNCH((AssignKernel<T>), blocks, threads, out, in, p, blks_x, blks_y);

    POST_LAUNCH_CHECK();
}
Example #13
    void randu(T *out, size_t elements)
    {
        int threads = THREADS;
        int blocks  = divup(elements, THREADS);
        if (blocks > BLOCKS) blocks = BLOCKS;

        curandState_t *state = getcurandState();

        CUDA_LAUNCH(uniform_kernel, blocks, threads, out, state, elements);
        POST_LAUNCH_CHECK();
    }
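
uniform_kernel itself is not shown; a plausible sketch, assuming one persistent curandState_t per resident thread (BLOCKS * THREADS states, which is what the clamp on blocks suggests) and a grid-stride loop:

#include <curand_kernel.h>

template<typename T>
__global__ void uniform_kernel(T *out, curandState_t *state, size_t elements)
{
    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState_t local = state[tid];  // work on a register copy
    for (size_t i = tid; i < elements; i += (size_t)blockDim.x * gridDim.x) {
        out[i] = (T)curand_uniform(&local);  // uniform in (0, 1]
    }
    state[tid] = local;  // persist so the next launch continues the sequence
}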
Example #14
void reduce_first_launcher(Param<To> out, CParam<Ti> in, const uint blocks_x,
                           const uint blocks_y, const uint threads_x,
                           bool change_nan, double nanval) {
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * in.dims[2], blocks_y * in.dims[3]);

    uint repeat = divup(in.dims[0], (blocks_x * threads_x));

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    switch (threads_x) {
        case 32:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 32>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat, change_nan,
                        scalar<To>(nanval));
            break;
        case 64:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 64>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat, change_nan,
                        scalar<To>(nanval));
            break;
        case 128:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 128>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat, change_nan,
                        scalar<To>(nanval));
            break;
        case 256:
            CUDA_LAUNCH((reduce_first_kernel<Ti, To, op, 256>), blocks, threads,
                        out, in, blocks_x, blocks_y, repeat, change_nan,
                        scalar<To>(nanval));
            break;
    }

    POST_LAUNCH_CHECK();
}
Example #15
        void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim3 threads(THREADS_X, THREADS_Y);
            dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);

            dim_t reps = divup((wx * wy), threads.y);

            CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
Example #16
void reduce_dim_launcher(Param<To> out, CParam<Ti> in, const uint threads_y,
                         const dim_t blocks_dim[4], bool change_nan,
                         double nanval) {
    dim3 threads(THREADS_X, threads_y);

    dim3 blocks(blocks_dim[0] * blocks_dim[2], blocks_dim[1] * blocks_dim[3]);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    switch (threads_y) {
        case 8:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 8>), blocks,
                        threads, out, in, blocks_dim[0], blocks_dim[1],
                        blocks_dim[dim], change_nan, scalar<To>(nanval));
            break;
        case 4:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 4>), blocks,
                        threads, out, in, blocks_dim[0], blocks_dim[1],
                        blocks_dim[dim], change_nan, scalar<To>(nanval));
            break;
        case 2:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 2>), blocks,
                        threads, out, in, blocks_dim[0], blocks_dim[1],
                        blocks_dim[dim], change_nan, scalar<To>(nanval));
            break;
        case 1:
            CUDA_LAUNCH((reduce_dim_kernel<Ti, To, op, dim, 1>), blocks,
                        threads, out, in, blocks_dim[0], blocks_dim[1],
                        blocks_dim[dim], change_nan, scalar<To>(nanval));
            break;
    }

    POST_LAUNCH_CHECK();
}
Example #17
        void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
        {
            dim3 threads(TX, TY, 1);

            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);
            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            CUDA_LAUNCH((iota_kernel<T>), blocks, threads,
                    out, sdims[0], sdims[1], sdims[2], sdims[3],
                    tdims[0], tdims[1], tdims[2], tdims[3], blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
Example #18
void wrap(Param<T> out, CParam<T> in, const int wx, const int wy, const int sx,
          const int sy, const int px, const int py, const bool is_column) {
    int nx = (out.dims[0] + 2 * px - wx) / sx + 1;
    int ny = (out.dims[1] + 2 * py - wy) / sy + 1;

    dim3 threads(THREADS_X, THREADS_Y);
    int blocks_x = divup(out.dims[0], threads.x);
    int blocks_y = divup(out.dims[1], threads.y);

    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    if (is_column) {
        CUDA_LAUNCH((wrap_kernel<T, true>), blocks, threads, out, in, wx, wy,
                    sx, sy, px, py, nx, ny, blocks_x, blocks_y);
    } else {
        CUDA_LAUNCH((wrap_kernel<T, false>), blocks, threads, out, in, wx, wy,
                    sx, sy, px, py, nx, ny, blocks_x, blocks_y);
    }

    POST_LAUNCH_CHECK();
}
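
For reference, nx and ny are the sliding-window counts implied by the wrap parameters: with out.dims[0] = 10, wx = 3, px = 0, sx = 1 (made-up numbers), nx = (10 + 0 - 3) / 1 + 1 = 8 window positions along the first dimension.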
    void transpose(Param<T> out, CParam<T> in, const int ndims)
    {
        // dimensions passed to this function should be input dimensions
        // any necessary transformations and dimension related calculations are
        // carried out here and inside the kernel
        dim3 threads(kernel::THREADS_X,kernel::THREADS_Y);


        int blk_x = divup(in.dims[0],TILE_DIM);
        int blk_y = divup(in.dims[1],TILE_DIM);
        // launch batch * blk_x blocks along x dimension
        dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);
        const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
        blocks.z = divup(blocks.y, maxBlocksY);
        blocks.y = divup(blocks.y, blocks.z);

        if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0) {
            CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
        } else {
            CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);
        }

        POST_LAUNCH_CHECK();
    }
Example #20
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads, grad0, grad1, in,
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
Example #21
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    dim3 blocks(blk_x*in.dims[2], blk_y*in.dims[3]);

    //TODO: add more cases when 5x5 and 7x7 kernels are done
    switch(ker_size) {
        case  3: CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x, blk_y); break;
    }

    POST_LAUNCH_CHECK();
}
Example #22
        void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

            dim3 threads(TX, THREADS_PER_BLOCK / TX);
            dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);

            dim_t reps = divup((wx * wy), threads.x); // is > 1 only when TX == 256 && wx * wy > 256

            CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
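
nextpow2() is another helper these snippets assume. A sketch under the usual round-up definition, which makes TX the smallest power of two covering out.dims[0], capped at THREADS_PER_BLOCK:

static inline dim_t nextpow2(dim_t n)
{
    // Round n up to the next power of two, e.g. 5 -> 8, 8 -> 8 (n > 0).
    dim_t p = 1;
    while (p < n) p <<= 1;
    return p;
}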
Example #23
        void rotate(Param<T> out, CParam<T> in, const float theta)
        {
            const float c = cos(-theta), s = sin(-theta);
            float tx, ty;
            {
                const float nx = 0.5 * (in.dims[0] - 1);
                const float ny = 0.5 * (in.dims[1] - 1);
                const float mx = 0.5 * (out.dims[0] - 1);
                const float my = 0.5 * (out.dims[1] - 1);
                const float sx = (mx * c + my *-s);
                const float sy = (mx * s + my * c);
                tx = -(sx - nx);
                ty = -(sy - ny);
            }

            // Rounding error. Anything more than 3 decimal points won't make a difference
            tmat_t t;
            t.tmat[0] = round( c * 1000) / 1000.0f;
            t.tmat[1] = round(-s * 1000) / 1000.0f;
            t.tmat[2] = round(tx * 1000) / 1000.0f;
            t.tmat[3] = round( s * 1000) / 1000.0f;
            t.tmat[4] = round( c * 1000) / 1000.0f;
            t.tmat[5] = round(ty * 1000) / 1000.0f;

            int nimages = in.dims[2];
            int nbatches = in.dims[3];

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            const int blocksXPerImage = blocks.x;
            const int blocksYPerImage = blocks.y;

            if(nimages > TI) {
                int tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            blocks.y = blocks.y * nbatches;

            CUDA_LAUNCH((rotate_kernel<T, method>), blocks, threads,
                    out, in, t, nimages, nbatches, blocksXPerImage, blocksYPerImage);

            POST_LAUNCH_CHECK();
        }
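
The six tmat entries encode, row by row, the affine map the kernel applies to every output pixel; with c = cos(-theta) and s = sin(-theta) as above:

$$
\begin{pmatrix} x_{\text{in}} \\ y_{\text{in}} \end{pmatrix}
=
\begin{pmatrix} c & -s \\ s & c \end{pmatrix}
\begin{pmatrix} x_{\text{out}} \\ y_{\text{out}} \end{pmatrix}
+
\begin{pmatrix} t_x \\ t_y \end{pmatrix}
$$

where t_x and t_y are chosen (the block computing tx and ty above) so that the center of the output image maps back onto the center of the input image.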
Example #24
    static void bcast_first_launcher(Param<To> out,
                                     CParam<To> tmp,
                                     const uint blocks_x,
                                     const uint blocks_y,
                                     const uint threads_x)
    {

        dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
        dim3 blocks(blocks_x * out.dims[2],
                    blocks_y * out.dims[3]);

        uint lim = divup(out.dims[0], (threads_x * blocks_x));

        CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads, out, tmp, blocks_x, blocks_y, lim);

        POST_LAUNCH_CHECK();
    }
Example #25
    static void bcast_dim_launcher(Param<To> out,
                                   CParam<To> tmp,
                                   const uint threads_y,
                                   const uint blocks_all[4])
    {

        dim3 threads(THREADS_X, threads_y);

        dim3 blocks(blocks_all[0] * blocks_all[2],
                    blocks_all[1] * blocks_all[3]);

        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads,
            out, tmp, blocks_all[0], blocks_all[1], blocks_all[dim], lim);

        POST_LAUNCH_CHECK();
    }
Example #26
void nonMaximal(float* x_out, float* y_out, float* resp_out,
                 unsigned* count, const unsigned idim0, const unsigned idim1,
                 const T * resp_in, const unsigned edge, const unsigned max_corners)
{
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));

    unsigned* d_corners_found = memAlloc<unsigned>(1);
    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
                cuda::getStream(cuda::getActiveDeviceId())));

    CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
            x_out, y_out, resp_out, d_corners_found, idim0, idim1, resp_in, edge, max_corners);

    POST_LAUNCH_CHECK();

    CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
    memFree(d_corners_found);
}
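
The output side of nonMaxKernel relies on the atomic-compaction idiom: each surviving corner claims a unique slot by incrementing d_corners_found. A minimal sketch of that step (hypothetical kernel; the real neighbourhood test is elided):

template<typename T>
__global__ void nonmax_compact_sketch(float* x_out, float* y_out, float* resp_out,
                                      unsigned* count, const unsigned idim0,
                                      const unsigned idim1, const T* resp_in,
                                      const unsigned edge, const unsigned max_corners)
{
    const unsigned x = blockIdx.x * blockDim.x + threadIdx.x + edge;
    const unsigned y = blockIdx.y * blockDim.y + threadIdx.y + edge;
    if (x >= idim0 - edge || y >= idim1 - edge) return;

    const T v = resp_in[y * idim0 + x];
    // ... neighbourhood maximum test elided; assume it produced is_max ...
    const bool is_max = true;
    if (!is_max) return;

    const unsigned id = atomicAdd(count, 1u);  // claim the next output slot
    if (id < max_corners) {                    // entries past the cap are dropped
        x_out[id]    = (float)x;
        y_out[id]    = (float)y;
        resp_out[id] = (float)v;
    }
}

Note that the counter can end up larger than max_corners, so the value copied back to the host should be clamped by the caller.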
Example #27
        void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a, const double b, int ndims)
        {
            dim3 threads(DIMX, DIMY);

            if (ndims == 1) {
                threads.x *= threads.y;
                threads.y = 1;
            }

            int blk_x = divup(out.dims[0], threads.x);
            int blk_y = divup(out.dims[1], threads.y);

            dim3 blocks(blk_x * out.dims[2],
                        blk_y * out.dims[3]);

            CUDA_LAUNCH((select_scalar_kernel<T, flip>), blocks, threads,
                        out, cond, a, scalar<T>(b), blk_x, blk_y);

            POST_LAUNCH_CHECK();
        }
Example #28
        void iota(Param<T> out, const af::dim4 &sdims, const af::dim4 &tdims)
        {
            dim3 threads(IOTA_TX, IOTA_TY, 1);

            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);

            dim3 blocks(blocksPerMatX * out.dims[2],
                        blocksPerMatY * out.dims[3],
                        1);

            const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
            blocks.z = divup(blocks.y, maxBlocksY);
            blocks.y = divup(blocks.y, blocks.z);

            CUDA_LAUNCH((iota_kernel<T>), blocks, threads,
                    out, sdims[0], sdims[1], sdims[2], sdims[3],
                    tdims[0], tdims[1], tdims[2], tdims[3], blocksPerMatX, blocksPerMatY);

            POST_LAUNCH_CHECK();
        }
void nearest_neighbour(Param<uint> idx,
                       Param<To> dist,
                       CParam<T> query,
                       CParam<T> train,
                       const dim_t dist_dim,
                       const unsigned n_dist)
{
    const unsigned feat_len = query.dims[dist_dim];
    const To max_dist = maxval<To>();

    if (feat_len > THREADS) {
        CUDA_NOT_SUPPORTED();
    }

    const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

    const unsigned nquery = query.dims[sample_dim];
    const unsigned ntrain = train.dims[sample_dim];

    dim3 threads(THREADS, 1);
    dim3 blocks(divup(ntrain, threads.x), 1);

    // Determine maximum feat_len capable of using shared memory (faster)
    int device = getActiveDeviceId();
    cudaDeviceProp prop = getDeviceProp(device);
    size_t avail_smem = prop.sharedMemPerBlock;
    size_t smem_predef = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
    size_t strain_sz = threads.x * feat_len * sizeof(T);
    bool use_shmem = (avail_smem >= (smem_predef + strain_sz));
    unsigned smem_sz = (use_shmem) ? smem_predef + strain_sz : smem_predef;

    unsigned nblk = blocks.x;

    auto d_blk_idx  = memAlloc<unsigned>(nblk * nquery);
    auto d_blk_dist = memAlloc<To>(nblk * nquery);

    // For each query vector, find training vector with smallest Hamming
    // distance per CUDA block
    if (use_shmem) {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,true>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    else {
        switch(feat_len) {
        // Optimized lengths (faster due to loop unrolling)
        case 1:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,1,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 2:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,2,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 4:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,4,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 8:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,8,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 16:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,16,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 32:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,32,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        case 64:
            CUDA_LAUNCH_SMEM((nearest_neighbour_unroll<T,To,dist_type,64,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist);
            break;
        default:
            CUDA_LAUNCH_SMEM((nearest_neighbour<T,To,dist_type,false>), blocks, threads, smem_sz,
                             d_blk_idx.get(), d_blk_dist.get(), query, train, max_dist, feat_len);
        }
    }
    POST_LAUNCH_CHECK();

    threads = dim3(32, 8);
    blocks = dim3(nquery, 1);

    // Reduce all smallest Hamming distances from each block and store final
    // best match
    CUDA_LAUNCH(select_matches, blocks, threads,
                idx, dist, d_blk_idx.get(), d_blk_dist.get(), nquery, nblk, max_dist);
    POST_LAUNCH_CHECK();

}
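
A back-of-envelope check of the use_shmem test with made-up numbers (THREADS = 256, T = float, feat_len = 64, 48 KB of shared memory per block): smem_predef = 2*256*sizeof(unsigned) + 64*sizeof(float) = 2304 bytes, strain_sz = 256*64*sizeof(float) = 65536 bytes, and 2304 + 65536 > 49152, so use_shmem is false and the kernel is launched with only the smem_predef bytes, reading train vectors from global memory instead.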
Example #30
int computeH(
    Param<T> bestH,
    Param<T> H,
    Param<float> err,
    CParam<float> x_src,
    CParam<float> y_src,
    CParam<float> x_dst,
    CParam<float> y_dst,
    CParam<float> rnd,
    const unsigned iterations,
    const unsigned nsamples,
    const float inlier_thr,
    const af_homography_type htype)
{
    dim3 threads(16, 16);
    dim3 blocks(1, divup(iterations, threads.y));

    // Build linear system and solve SVD
    size_t ls_shared_sz = threads.x * 81 * 2 * sizeof(T);
    CUDA_LAUNCH_SMEM((buildLinearSystem<T>), blocks, threads, ls_shared_sz,
                H, x_src, y_src, x_dst, y_dst, rnd, iterations);
    POST_LAUNCH_CHECK();

    threads = dim3(256);
    blocks = dim3(divup(iterations, threads.x));

    // Allocate some temporary buffers
    Param<unsigned> idx, inliers;
    Param<float> median;
    inliers.dims[0] = (htype == AF_HOMOGRAPHY_RANSAC) ? blocks.x : divup(nsamples, threads.x);
    inliers.strides[0] = 1;
    idx.dims[0] = median.dims[0] = blocks.x;
    idx.strides[0] = median.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        inliers.dims[k] = 1;
        inliers.strides[k] = inliers.dims[k-1] * inliers.strides[k-1];
        idx.dims[k] = median.dims[k] = 1;
        idx.strides[k] = median.strides[k] = idx.dims[k-1] * idx.strides[k-1];
    }
    idx.ptr = memAlloc<unsigned>(idx.dims[3] * idx.strides[3]);
    inliers.ptr = memAlloc<unsigned>(inliers.dims[3] * inliers.strides[3]);
    if (htype == AF_HOMOGRAPHY_LMEDS)
        median.ptr = memAlloc<float>(median.dims[3] * median.strides[3]);

    // Compute (and for RANSAC, evaluate) homographies
    CUDA_LAUNCH((computeEvalHomography<T>), blocks, threads,
                 inliers, idx, H, err, x_src, y_src, x_dst, y_dst,
                 rnd, iterations, nsamples, inlier_thr, htype);
    POST_LAUNCH_CHECK();

    unsigned inliersH = 0, idxH = 0;
    if (htype == AF_HOMOGRAPHY_LMEDS) {
        // TODO: Improve this sorting, if the number of iterations is
        // sufficiently large, this can be *very* slow
        kernel::sort0<float, true>(err);

        unsigned minIdx;
        float minMedian;

        // Compute median of every iteration
        CUDA_LAUNCH((computeMedian), blocks, threads,
                    median, idx, err, iterations);
        POST_LAUNCH_CHECK();

        // Reduce the per-block medians; only needed when iterations > 256
        // (i.e. more than one block was launched)
        if (blocks.x > 1) {
            blocks = dim3(1);

            float* finalMedian = memAlloc<float>(1);
            unsigned* finalIdx = memAlloc<unsigned>(1);

            CUDA_LAUNCH((findMinMedian), blocks, threads,
                        finalMedian, finalIdx, median, idx);
            POST_LAUNCH_CHECK();

            CUDA_CHECK(cudaMemcpyAsync(&minMedian, finalMedian, sizeof(float),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, finalIdx, sizeof(unsigned),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

            memFree(finalMedian);
            memFree(finalIdx);
        } else {
            CUDA_CHECK(cudaMemcpyAsync(&minMedian, median.ptr, sizeof(float),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
            CUDA_CHECK(cudaMemcpyAsync(&minIdx, idx.ptr, sizeof(unsigned),
                        cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        }

        // Copy best homography to output
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + minIdx * 9, 9*sizeof(T),
                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));

        blocks = dim3(divup(nsamples, threads.x));
        // sync stream for the device to host copies to be visible for
        // the subsequent kernel launch
        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

        CUDA_LAUNCH((computeLMedSInliers<T>), blocks, threads,
                    inliers, bestH, x_src, y_src, x_dst, y_dst,
                    minMedian, nsamples);
        POST_LAUNCH_CHECK();

        // Adds up the total number of inliers
        Param<unsigned> totalInliers;
        for (int k = 0; k < 4; k++)
            totalInliers.dims[k] = totalInliers.strides[k] = 1;
        totalInliers.ptr = memAlloc<unsigned>(1);

        kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);

        CUDA_CHECK(cudaMemcpyAsync(&inliersH, totalInliers.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

        memFree(totalInliers.ptr);
        memFree(median.ptr);
    } else if (htype == AF_HOMOGRAPHY_RANSAC) {
        Param<unsigned> bestInliers, bestIdx;
        for (int k = 0; k < 4; k++) {
            bestInliers.dims[k] = bestIdx.dims[k] = 1;
            bestInliers.strides[k] = bestIdx.strides[k] = 1;
        }
        bestInliers.ptr = memAlloc<unsigned>(1);
        bestIdx.ptr = memAlloc<unsigned>(1);

        kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.ptr, inliers, 0);

        unsigned blockIdx;
        CUDA_CHECK(cudaMemcpyAsync(&blockIdx, bestIdx.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));

        // Copy back the index and inlier count of the best homography estimate
        CUDA_CHECK(cudaMemcpyAsync(&idxH, idx.ptr+blockIdx, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(&inliersH, bestInliers.ptr, sizeof(unsigned),
                    cudaMemcpyDeviceToHost, cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaMemcpyAsync(bestH.ptr, H.ptr + idxH * 9, 9*sizeof(T),
                    cudaMemcpyDeviceToDevice, cuda::getStream(cuda::getActiveDeviceId())));

        memFree(bestInliers.ptr);
        memFree(bestIdx.ptr);
    }

    memFree(inliers.ptr);
    memFree(idx.ptr);
    // sync stream so the asynchronous device-to-host copies above have
    // completed before inliersH is read
    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

    return (int)inliersH;
}