Example #1
    static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
    {
        std::string ref_name =
            std::string("scan_") +
            std::to_string(dim) +
            std::string("_") +
            std::to_string(isFinalPass) +
            std::string("_") +
            std::string(dtype_traits<Ti>::getName()) +
            std::string("_") +
            std::string(dtype_traits<To>::getName()) +
            std::string("_") +
            std::to_string(op) +
            std::string("_") +
            std::to_string(threads_y) +
            std::string("_") +
            std::to_string(int(inclusive_scan));

        int device = getActiveDeviceId();

        kc_entry_t entry = kernelCache(device, ref_name);

        if (entry.prog==0 && entry.ker==0) {
            Binary<To, op> scan;
            ToNumStr<To> toNumStr;

            std::ostringstream options;
            options << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D T=To"
                    << " -D dim=" << dim
                    << " -D DIMY=" << threads_y
                    << " -D THREADS_X=" << THREADS_X
                    << " -D init=" << toNumStr(scan.init())
                    << " -D " << binOpName<op>()
                    << " -D CPLX=" << af::iscplx<Ti>()
                    << " -D isFinalPass="******" -D inclusive_scan=" << inclusive_scan;
            if (std::is_same<Ti, double>::value ||
                std::is_same<Ti, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            const char *ker_strs[] = {ops_cl, scan_dim_cl};
            const int   ker_lens[] = {ops_cl_len, scan_dim_cl_len};
            cl::Program prog;
            buildProgram(prog, 2, ker_strs, ker_lens, options.str());

            entry.prog = new Program(prog);
            entry.ker = new Kernel[2];

            entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
            entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");


            addKernelToCache(device, ref_name, entry);
        }

        return entry.ker[kerIdx];
    }
Example #2
    To reduce_all(const Array<Ti> &in)
    {
        Transform<Ti, To, op> transform;
        Binary<To, op> reduce;

        To out = reduce.init();

        // Walk every element via the input's strides so non-contiguous arrays are handled
        af::dim4 dims = in.dims();
        af::dim4 strides = in.strides();
        const Ti *inPtr = in.get();

        for(dim_t l = 0; l < dims[3]; l++) {
            dim_t off3 = l * strides[3];

            for(dim_t k = 0; k < dims[2]; k++) {
                dim_t off2 = k * strides[2];

                for(dim_t j = 0; j < dims[1]; j++) {
                    dim_t off1 = j * strides[1];

                    for(dim_t i = 0; i < dims[0]; i++) {
                        dim_t idx = i + off1 + off2 + off3;

                        To val = transform(inPtr[idx]);
                        out = reduce(val, out);
                    }
                }
            }
        }

        return out;
    }
Example #3
        void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
                        const Ti *in , const dim4 &istrides, const dim4 &idims,
                        const int dim)
        {
            dim_t stride = istrides[dim];

            To out_val = reduce.init();
            for (dim_t i = 0; i < idims[dim]; i++) {
                To in_val = transform(in[i * stride]);
                out_val = reduce(in_val, out_val);
            }

            *out = out_val;
        }
Example #4
        void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
                        const Ti *in , const dim4 &istrides, const dim4 &idims,
                        const int dim, bool change_nan, double nanval)
        {
            dim_t stride = istrides[dim];

            To out_val = reduce.init();
            for (dim_t i = 0; i < idims[dim]; i++) {
                To in_val = transform(in[i * stride]);
                if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
                out_val = reduce(in_val, out_val);
            }

            *out = out_val;
        }
Example #5
static Kernel* get_scan_dim_kernels(int kerIdx)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> scanProgs;
        static std::map<int, Kernel*>  scanKerns;
        static std::map<int, Kernel*>  bcastKerns;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {

            Binary<To, op> scan;
            ToNum<To> toNum;

            std::ostringstream options;
            options << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D T=To"
                    << " -D dim=" << dim
                    << " -D DIMY=" << threads_y
                    << " -D THREADS_X=" << THREADS_X
                    << " -D init=" << toNum(scan.init())
                    << " -D " << binOpName<op>()
                    << " -D CPLX=" << af::iscplx<Ti>()
                    << " -D isFinalPass="******" -D USE_DOUBLE";
            }

            const char *ker_strs[] = {ops_cl, scan_dim_cl};
            const int   ker_lens[] = {ops_cl_len, scan_dim_cl_len};
            cl::Program prog;
            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
            scanProgs[device] = new Program(prog);

            scanKerns[device] = new Kernel(*scanProgs[device],  "scan_dim_kernel");
            bcastKerns[device] = new Kernel(*scanProgs[device],  "bcast_dim_kernel");

        });

        return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Example #6
        void operator()(To *out, const dim4 ostrides, const dim4 odims,
                        const Ti *in , const dim4 istrides, const dim4 idims,
                        const int dim)
        {

            dim_type istride = istrides[dim];
            dim_type ostride = ostrides[dim];

            Transform<Ti, To, op> transform;
            // FIXME: Change the name to something better
            Binary<To, op> scan;

            To out_val = scan.init();
            for (dim_type i = 0; i < idims[dim]; i++) {
                To in_val = transform(in[i * istride]);
                out_val = scan(in_val, out_val);
                out[i * ostride] = out_val;
            }
        }
Example #7
    static Kernel get_scan_dim_kernels(int kerIdx)
    {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program           scanProgs[DeviceManager::MAX_DEVICES];
        static Kernel            scanKerns[DeviceManager::MAX_DEVICES];
        static Kernel           bcastKerns[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {

                Binary<To, op> scan;
                ToNum<To> toNum;

                std::ostringstream options;
                options << " -D To=" << dtype_traits<To>::getName()
                        << " -D Ti=" << dtype_traits<Ti>::getName()
                        << " -D T=To"
                        << " -D dim=" << dim
                        << " -D DIMY=" << threads_y
                        << " -D THREADS_X=" << THREADS_X
                        << " -D init=" << toNum(scan.init())
                        << " -D " << binOpName<op>()
                        << " -D CPLX=" << af::iscplx<Ti>()
                        << " -D isFinalPass="******"scan_dim_kernel");
                bcastKerns[device] = Kernel(scanProgs[device],  "bcast_dim_kernel");

            });

        return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
    }
Example #8
    __global__
    static void reduce_dim_kernel(Param<To> out,
                                  CParam <Ti> in,
                                  uint blocks_x, uint blocks_y, uint offset_dim,
                                  bool change_nan, To nanval)
    {
        const uint tidx = threadIdx.x;
        const uint tidy = threadIdx.y;
        const uint tid  = tidy * THREADS_X + tidx;

        const uint zid = blockIdx.x / blocks_x;
        const uint wid = blockIdx.y / blocks_y;
        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const uint xid = blockIdx_x * blockDim.x + tidx;
        const uint yid = blockIdx_y; // yid  of output. updated for input later.

        uint ids[4] = {xid, yid, zid, wid};

        const Ti *iptr = in.ptr;
        To *optr = out.ptr;

        // There is only one element per block for out
        // There are blockDim.y elements per block for in
        // Hence increment ids[dim] just after offseting out and before offsetting in
        optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
        const uint blockIdx_dim = ids[dim];

        ids[dim] = ids[dim] * blockDim.y + tidy;
        iptr  += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
        const uint id_dim_in = ids[dim];

        const uint istride_dim = in.strides[dim];

        bool is_valid =
            (ids[0] < in.dims[0]) &&
            (ids[1] < in.dims[1]) &&
            (ids[2] < in.dims[2]) &&
            (ids[3] < in.dims[3]);

        Transform<Ti, To, op> transform;
        Binary<To, op> reduce;

        __shared__ To s_val[THREADS_X * DIMY];

        To out_val = reduce.init();
        for (int id = id_dim_in; is_valid && (id < in.dims[dim]); id += offset_dim * blockDim.y) {
            To in_val = transform(*iptr);
            if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
            out_val = reduce(in_val, out_val);
            iptr = iptr + offset_dim * blockDim.y * istride_dim;
        }

        s_val[tid] = out_val;

        To *s_ptr = s_val + tid;
        __syncthreads();

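        // Tree-reduce the per-thread partials stored in shared memory along the y
        // threads (the branches below handle DIMY values of 1, 2, 4 and 8).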
        if (DIMY == 8) {
            if (tidy < 4) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 4]);
            __syncthreads();
        }

        if (DIMY >= 4) {
            if (tidy < 2) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 2]);
            __syncthreads();
        }

        if (DIMY >= 2) {
            if (tidy < 1) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 1]);
            __syncthreads();
        }

        if (tidy == 0 && is_valid &&
            (blockIdx_dim < out.dims[dim])) {
            *optr = *s_ptr;
        }

    }
Example #9
    __global__
    static void scan_dim_nonfinal_kernel(Param<To> out,
                                         Param<To> tmp,
                                         Param<char> tflg,
                                         Param<int> tlid,
                                         CParam<Ti>  in,
                                         CParam<Tk> key,
                                         int dim,
                                         uint blocks_x,
                                         uint blocks_y,
                                         uint lim,
                                         bool inclusive_scan)
    {
        const int tidx = threadIdx.x;
        const int tidy = threadIdx.y;
        const int tid  = tidy * THREADS_X + tidx;

        const int zid = blockIdx.x / blocks_x;
        const int wid = blockIdx.y / blocks_y;
        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const int xid = blockIdx_x * blockDim.x + tidx;
        const int yid = blockIdx_y; // yid  of output. updated for input later.

        int ids[4] = {xid, yid, zid, wid};

        const Ti *iptr = in.ptr;
        const Tk *kptr = key.ptr;
        To *optr = out.ptr;
        To *tptr = tmp.ptr;
        char *tfptr = tflg.ptr;
        int *tiptr = tlid.ptr;

        // There is only one element per block for out
        // There are blockDim.y elements per block for in
        // Hence increment ids[dim] just after offseting out and before offsetting in
        tptr    += ids[3] *  tmp.strides[3] + ids[2] *  tmp.strides[2] + ids[1] *  tmp.strides[1] + ids[0];
        tfptr   += ids[3] * tflg.strides[3] + ids[2] * tflg.strides[2] + ids[1] * tflg.strides[1] + ids[0];
        tiptr   += ids[3] * tlid.strides[3] + ids[2] * tlid.strides[2] + ids[1] * tlid.strides[1] + ids[0];
        const int blockIdx_dim = ids[dim];

        ids[dim] = ids[dim] * blockDim.y * lim + tidy;
        optr  += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
        iptr  += ids[3] *  in.strides[3] + ids[2] *  in.strides[2] + ids[1] *  in.strides[1] + ids[0];
        kptr  += ids[3] * key.strides[3] + ids[2] * key.strides[2] + ids[1] * key.strides[1] + ids[0];
        int id_dim = ids[dim];
        const int out_dim = out.dims[dim];

        bool is_valid =
            (ids[0] < out.dims[0]) &&
            (ids[1] < out.dims[1]) &&
            (ids[2] < out.dims[2]) &&
            (ids[3] < out.dims[3]);

        const int ostride_dim = out.strides[dim];
        const int istride_dim =  in.strides[dim];

        __shared__ char s_flg[THREADS_X * DIMY * 2];
        __shared__ To s_val[THREADS_X * DIMY * 2];
        __shared__ char s_ftmp[THREADS_X];
        __shared__ To s_tmp[THREADS_X];
        __shared__ int boundaryid[THREADS_X];
        To *sptr =  s_val + tid;
        char *sfptr = s_flg + tid;

        Transform<Ti, To, op> transform;
        Binary<To, op> binop;

        const To init = binop.init();
        To val = init;

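        // s_tmp / s_ftmp carry each column's running value and head flag from one tile
        // of the k-loop to the next (written by the last row of threads, tidy == DIMY-1);
        // boundaryid remembers the most recent segment boundary seen in that column.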
        const bool isLast = (tidy == (DIMY - 1));
        if (isLast) {
            s_tmp[tidx] = val;
            s_ftmp[tidx] = 0;
            boundaryid[tidx] = -1;
        }
        __syncthreads();

        char flag = 0;
        for (int k = 0; k < lim; k++) {

            if (id_dim < out_dim) {
                flag = calculate_head_flags_dim(kptr, id_dim, key.strides[dim]);
            } else {
                flag = 0;
            }

            //Load val from global in
            if (inclusive_scan) {
                if (id_dim >= out_dim) {
                    val = init;
                } else {
                    val = transform(*iptr);
                }
            } else {
                if ((id_dim == 0) || (id_dim >= out_dim) || flag) {
                    val = init;
                } else {
                    val = transform(*(iptr - istride_dim));
                }
            }

            //Add partial result from last iteration before scan operation
            if ((tidy == 0) && (flag == 0)) {
                val = binop(val, s_tmp[tidx]);
                flag = s_ftmp[tidx];
            }

            //Write to shared memory
            *sptr = val;
            *sfptr = flag;
            __syncthreads();

            //Segmented Scan
            int start = 0;
#pragma unroll
            for (int off = 1; off < DIMY; off *= 2) {

                if (tidy >= off) {
                    val = sfptr[start * THREADS_X] ? val : binop(val, sptr[(start - off) * THREADS_X]);
                    flag = sfptr[start * THREADS_X] | sfptr[(start - off) * THREADS_X];
                }
                start = DIMY - start;
                sptr[start * THREADS_X] = val;
                sfptr[start * THREADS_X] = flag;

                __syncthreads();
            }

            //Identify segment boundary
            if (tidy == 0) {
                if ((s_ftmp[tidx] == 0) && (sfptr[start * THREADS_X] == 1)) {
                    boundaryid[tidx] = id_dim;
                }
            } else {
                if ((sfptr[(start - 1) * THREADS_X] == 0) && (sfptr[start * THREADS_X] == 1)) {
                    boundaryid[tidx] = id_dim;
                }
            }
            __syncthreads();

            if (is_valid && (id_dim < out_dim)) *optr = val;
            if (isLast) {
                s_tmp[tidx] = val;
                s_ftmp[tidx] = flag;
            }
            id_dim += blockDim.y;
            kptr += blockDim.y * key.strides[dim];
            iptr += blockDim.y * istride_dim;
            optr += blockDim.y * ostride_dim;
            __syncthreads();
        }

        if (is_valid &&
            (blockIdx_dim < tmp.dims[dim]) &&
            isLast) {
            *tptr = val;
            *tfptr = flag;
            int boundary = boundaryid[tidx];
            *tiptr = (boundary == -1) ? id_dim : boundary;
        }
    }
Example #10
    __global__
    static void scan_dim_kernel(Param<To> out,
                                Param<To> tmp,
                                CParam<Ti>  in,
                                uint blocks_x,
                                uint blocks_y,
                                uint blocks_dim,
                                uint lim)
    {
        const int tidx = threadIdx.x;
        const int tidy = threadIdx.y;
        const int tid  = tidy * THREADS_X + tidx;

        const int zid = blockIdx.x / blocks_x;
        const int wid = blockIdx.y / blocks_y;
        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const int xid = blockIdx_x * blockDim.x + tidx;
        const int yid = blockIdx_y; // yid  of output. updated for input later.

        int ids[4] = {xid, yid, zid, wid};

        const Ti *iptr = in.ptr;
        To *optr = out.ptr;
        To *tptr = tmp.ptr;

        // There is only one element per block for out
        // There are blockDim.y elements per block for in
        // Hence increment ids[dim] just after offseting out and before offsetting in
        tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] + ids[1] * tmp.strides[1] + ids[0];
        const int blockIdx_dim = ids[dim];

        ids[dim] = ids[dim] * blockDim.y * lim + tidy;
        optr  += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
        iptr  += ids[3] *  in.strides[3] + ids[2] *  in.strides[2] + ids[1] *  in.strides[1] + ids[0];
        int id_dim = ids[dim];
        const int out_dim = out.dims[dim];

        bool is_valid =
            (ids[0] < out.dims[0]) &&
            (ids[1] < out.dims[1]) &&
            (ids[2] < out.dims[2]) &&
            (ids[3] < out.dims[3]);

        const int ostride_dim = out.strides[dim];
        const int istride_dim =  in.strides[dim];

        __shared__ To s_val[THREADS_X * DIMY * 2];
        __shared__ To s_tmp[THREADS_X];
        To *sptr =  s_val + tid;

        Transform<Ti, To, op> transform;
        Binary<To, op> binop;

        const To init = binop.init();
        To val = init;

        const bool isLast = (tidy == (DIMY - 1));

        for (int k = 0; k < lim; k++) {

            if (isLast) s_tmp[tidx] = val;

            bool cond = (is_valid) && (id_dim < out_dim);
            val = cond ? transform(*iptr) : init;
            *sptr = val;
            __syncthreads();

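            // Double-buffered Hillis-Steele scan over the block's y threads;
            // `start` ping-pongs between the two halves of s_val on each step.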
            int start = 0;
#pragma unroll
            for (int off = 1; off < DIMY; off *= 2) {

                if (tidy >= off) val = binop(val, sptr[(start - off) * THREADS_X]);
                start = DIMY - start;
                sptr[start * THREADS_X] = val;

                __syncthreads();
            }

            val = binop(val, s_tmp[tidx]);
            __syncthreads();
            if (cond) *optr = val;

            id_dim += blockDim.y;
            iptr += blockDim.y * istride_dim;
            optr += blockDim.y * ostride_dim;
        }

        if (!isFinalPass &&
            is_valid &&
            (blockIdx_dim < tmp.dims[dim]) &&
            isLast) {
            *tptr = val;
        }
    }
Example #11
    __global__
    static void ireduce_dim_kernel(Param<T> out, uint *olptr,
                                  CParam <T> in, const uint *ilptr,
                                  uint blocks_x, uint blocks_y, uint offset_dim)
    {
        const uint tidx = threadIdx.x;
        const uint tidy = threadIdx.y;
        const uint tid  = tidy * THREADS_X + tidx;

        const uint zid = blockIdx.x / blocks_x;
        const uint wid = blockIdx.y / blocks_y;
        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const uint xid = blockIdx_x * blockDim.x + tidx;
        const uint yid = blockIdx_y; // yid  of output. updated for input later.

        uint ids[4] = {xid, yid, zid, wid};

        const T *iptr = in.ptr;
        T *optr = out.ptr;

        // There is only one element per block for out
        // There are blockDim.y elements per block for in
        // Hence increment ids[dim] just after offseting out and before offsetting in
        optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
        olptr += ids[3] * out.strides[3] + ids[2] * out.strides[2] + ids[1] * out.strides[1] + ids[0];
        const uint blockIdx_dim = ids[dim];

        ids[dim] = ids[dim] * blockDim.y + tidy;
        iptr  += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
        if (!is_first) ilptr  += ids[3] * in.strides[3] + ids[2] * in.strides[2] + ids[1] * in.strides[1] + ids[0];
        const uint id_dim_in = ids[dim];

        const uint istride_dim = in.strides[dim];

        bool is_valid =
            (ids[0] < in.dims[0]) &&
            (ids[1] < in.dims[1]) &&
            (ids[2] < in.dims[2]) &&
            (ids[3] < in.dims[3]);

        Binary<T, op> ireduce;

        T val = ireduce.init();
        uint idx = id_dim_in;

        if (is_valid && id_dim_in < in.dims[dim]) {
            val = *iptr;
            if (!is_first) idx = *ilptr;
        }

        MinMaxOp<op, T> Op(val, idx);

        const uint id_dim_in_start = id_dim_in + offset_dim * blockDim.y;

        __shared__ T s_val[THREADS_X * DIMY];
        __shared__ uint s_idx[THREADS_X * DIMY];

        for (int id = id_dim_in_start;
             is_valid && (id < in.dims[dim]);
             id += offset_dim * blockDim.y) {

            iptr = iptr + offset_dim * blockDim.y * istride_dim;
            if (!is_first) {
                ilptr = ilptr + offset_dim * blockDim.y * istride_dim;
                Op(*iptr, *ilptr);
            } else {
                Op(*iptr, id);
            }
        }

        s_val[tid] = Op.m_val;
        s_idx[tid] = Op.m_idx;

        T *s_vptr = s_val + tid;
        uint *s_iptr = s_idx + tid;
        __syncthreads();

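        // Tree-reduce the per-thread (value, index) pairs in shared memory along the
        // y threads; MinMaxOp keeps the winning value together with its linear index.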
        if (DIMY == 8) {
            if (tidy < 4) {
                Op(s_vptr[THREADS_X * 4], s_iptr[THREADS_X * 4]);
                *s_vptr = Op.m_val;
                *s_iptr = Op.m_idx;
            }
            __syncthreads();
        }

        if (DIMY >= 4) {
            if (tidy < 2) {
                Op(s_vptr[THREADS_X * 2], s_iptr[THREADS_X * 2]);
                *s_vptr = Op.m_val;
                *s_iptr = Op.m_idx;
            }
            __syncthreads();
        }

        if (DIMY >= 2) {
            if (tidy < 1) {
                Op(s_vptr[THREADS_X * 1], s_iptr[THREADS_X * 1]);
                *s_vptr = Op.m_val;
                *s_iptr = Op.m_idx;
            }
            __syncthreads();
        }

        if (tidy == 0 && is_valid &&
            (blockIdx_dim < out.dims[dim])) {
            *optr = *s_vptr;
            *olptr = *s_iptr;
        }

    }
Example #12
    __global__
    static void ireduce_first_kernel(Param<T> out, uint *olptr,
                                    CParam<T>  in, const uint *ilptr,
                                    uint blocks_x, uint blocks_y, uint repeat)
    {
        const uint tidx = threadIdx.x;
        const uint tidy = threadIdx.y;
        const uint tid  = tidy * blockDim.x + tidx;

        const uint zid = blockIdx.x / blocks_x;
        const uint wid = blockIdx.y / blocks_y;
        const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const uint xid = blockIdx_x * blockDim.x * repeat + tidx;
        const uint yid = blockIdx_y * blockDim.y + tidy;

        const T *iptr = in.ptr;
        T *optr = out.ptr;

        iptr += wid *  in.strides[3] + zid *  in.strides[2] + yid *  in.strides[1];
        optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];

        if (!is_first) ilptr += wid *  in.strides[3] + zid *  in.strides[2] + yid *  in.strides[1];
        olptr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];

        if (yid >= in.dims[1] ||
            zid >= in.dims[2] ||
            wid >= in.dims[3]) return;

        int lim = min((int)(xid + repeat * DIMX), in.dims[0]);

        Binary<T, op> ireduce;

        T val = ireduce.init();
        uint idx = xid;

        if (xid < lim) {
            val = iptr[xid];
            if (!is_first) idx = ilptr[xid];
        }

        MinMaxOp<op, T> Op(val, idx);

        __shared__ T s_val[THREADS_PER_BLOCK];
        __shared__ uint s_idx[THREADS_PER_BLOCK];


        for (int id = xid + DIMX; id < lim; id += DIMX) {
            Op(iptr[id], (!is_first) ? ilptr[id] : id);
        }

        s_val[tid] = Op.m_val;
        s_idx[tid] = Op.m_idx;
        __syncthreads();

        T *s_vptr = s_val + tidy * DIMX;
        uint *s_iptr = s_idx + tidy * DIMX;

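        // Shared-memory tree reduction along x down to 32 candidates per row;
        // warp_reduce below finishes the last warp.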
        if (DIMX == 256) {
            if (tidx < 128) {
                Op(s_vptr[tidx + 128], s_iptr[tidx + 128]);
                s_vptr[tidx] = Op.m_val;
                s_iptr[tidx] = Op.m_idx;
            }
            __syncthreads();
        }

        if (DIMX >= 128) {
            if (tidx <  64) {
                Op(s_vptr[tidx +  64], s_iptr[tidx +  64]);
                s_vptr[tidx] = Op.m_val;
                s_iptr[tidx] = Op.m_idx;
            }
            __syncthreads();
        }

        if (DIMX >=  64) {
            if (tidx <  32) {
                Op(s_vptr[tidx +  32], s_iptr[tidx +  32]);
                s_vptr[tidx] = Op.m_val;
                s_iptr[tidx] = Op.m_idx;
            }
            __syncthreads();
        }

        warp_reduce<T, op>(s_vptr, s_iptr, tidx);

        if (tidx == 0) {
            optr[blockIdx_x] = s_vptr[0];
            olptr[blockIdx_x] = s_iptr[0];
        }
    }
Example #13
void mean_first_launcher(Param out, Param owt,
        Param in, Param inWeight,
        const int threads_x,
        const uint groups_x,
        const uint groups_y)
{

    bool input_weight = ((inWeight.info.dims[0] *
                          inWeight.info.dims[1] *
                          inWeight.info.dims[2] *
                          inWeight.info.dims[3]) != 0);

    bool output_weight = (( owt.info.dims[0] *
                            owt.info.dims[1] *
                            owt.info.dims[2] *
                            owt.info.dims[3]) != 0);

    std::string ref_name =
        std::string("mean_0_") +
        std::string(dtype_traits<Ti>::getName()) +
        std::string("_") +
        std::string(dtype_traits<Tw>::getName()) +
        std::string("_") +
        std::string(dtype_traits<To>::getName()) +
        std::string("_") +
        std::to_string(threads_x) +
        std::string("_") +
        std::to_string(input_weight) +
        std::string("_") +
        std::to_string(output_weight);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog==0 && entry.ker==0) {

        Binary<To, af_add_t> mean;
        ToNumStr<To> toNumStr;
        ToNumStr<Tw> twNumStr;
        Transform<uint, Tw, af_add_t> transform_weight;

        std::ostringstream options;
        options << " -D Ti=" << dtype_traits<Ti>::getName()
            << " -D Tw=" << dtype_traits<Tw>::getName()
            << " -D To=" << dtype_traits<To>::getName()
            << " -D DIMX=" << threads_x
            << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
            << " -D init_To=" << toNumStr(mean.init())
            << " -D init_Tw=" << twNumStr(transform_weight(0))
            << " -D one_Tw=" << twNumStr(transform_weight(1));

        if (input_weight) { options << " -D INPUT_WEIGHT"; }
        if (output_weight) { options << " -D OUTPUT_WEIGHT"; }

        if (std::is_same<Ti, double>::value ||
                std::is_same<Ti, cdouble>::value ||
                std::is_same<To, double>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
        const int   ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "mean_first_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
            groups_y * in.info.dims[3] * local[1]);

    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

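    // Dispatch the kernel with one of four argument lists, depending on whether the
    // optional input and/or output weight buffers are present (matching the
    // INPUT_WEIGHT / OUTPUT_WEIGHT defines set above).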
    if (input_weight && output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *owt.data, owt.info,
                *in.data, in.info,
                *inWeight.data, inWeight.info,
                groups_x, groups_y, repeat);
    } else if (!input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                groups_x, groups_y, repeat);
    } else if ( input_weight && !output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *in.data, in.info,
                *inWeight.data, inWeight.info,
                groups_x, groups_y, repeat);
    } else if (!input_weight &&  output_weight) {
        auto meanOp = KernelFunctor<
            Buffer, KParam,
            Buffer, KParam,
            Buffer, KParam,
            uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info,
                *owt.data, owt.info,
                *in.data, in.info,
                groups_x, groups_y, repeat);
    }

    CL_DEBUG_FINISH(getQueue());
}
Example #14
    __global__
    static void scan_first_kernel(Param<To> out,
                                  Param<To> tmp,
                                  CParam<Ti>  in,
                                  uint blocks_x,
                                  uint blocks_y,
                                  uint lim)
    {
        const int tidx = threadIdx.x;
        const int tidy = threadIdx.y;

        const int zid = blockIdx.x / blocks_x;
        const int wid = blockIdx.y / blocks_y;
        const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
        const int xid = blockIdx_x * blockDim.x * lim + tidx;
        const int yid = blockIdx_y * blockDim.y + tidy;

        bool cond_yzw = (yid < out.dims[1]) && (zid < out.dims[2]) && (wid < out.dims[3]);

        if (!cond_yzw) return; // retire warps early

        const Ti *iptr = in.ptr;
        To *optr = out.ptr;
        To *tptr = tmp.ptr;

        iptr += wid *  in.strides[3] + zid *  in.strides[2] + yid *  in.strides[1];
        optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
        tptr += wid * tmp.strides[3] + zid * tmp.strides[2] + yid * tmp.strides[1];


        const int DIMY = THREADS_PER_BLOCK / DIMX;
        const int SHARED_MEM_SIZE = (2 * DIMX + 1) * (DIMY);

        __shared__ To s_val[SHARED_MEM_SIZE];
        __shared__ To s_tmp[DIMY];

        To *sptr = s_val + tidy * (2 * DIMX + 1);

        Transform<Ti, To, op> transform;
        Binary<To, op> binop;

        const To init = binop.init();
        int id = xid;
        To val = init;

        const bool isLast = (tidx == (DIMX - 1));

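        // Each row of the block scans `lim` consecutive tiles of DIMX elements along
        // dim 0; the last thread of the row (isLast) stashes its running total in
        // s_tmp so it is added as a carry into the next tile.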
        for (int k = 0; k < lim; k++) {

            if (isLast) s_tmp[tidy] = val;

            bool cond = ((id < out.dims[0]));
            val = cond ? transform(iptr[id]) : init;
            sptr[tidx] = val;
            __syncthreads();


            int start = 0;
#pragma unroll
            for (int off = 1; off < DIMX; off *= 2) {

                if (tidx >= off) val = binop(val, sptr[(start - off) + tidx]);
                start = DIMX - start;
                sptr[start + tidx] = val;

                __syncthreads();
            }

            val = binop(val, s_tmp[tidy]);
            if (cond) optr[id] = val;
            id += blockDim.x;
            __syncthreads();
        }

        if (!isFinalPass && isLast) {
            tptr[blockIdx_x] = val;
        }
    }