static Kernel get_scan_dim_kernels(int kerIdx, int dim, bool isFinalPass, uint threads_y)
{
    std::string ref_name =
        std::string("scan_") + std::to_string(dim) + std::string("_") +
        std::to_string(isFinalPass) + std::string("_") +
        std::string(dtype_traits<Ti>::getName()) + std::string("_") +
        std::string(dtype_traits<To>::getName()) + std::string("_") +
        std::to_string(op) + std::string("_") +
        std::to_string(threads_y) + std::string("_") +
        std::to_string(int(inclusive_scan));

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        Binary<To, op> scan;
        ToNumStr<To> toNumStr;

        std::ostringstream options;
        options << " -D To=" << dtype_traits<To>::getName()
                << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D T=To"
                << " -D dim=" << dim
                << " -D DIMY=" << threads_y
                << " -D THREADS_X=" << THREADS_X
                << " -D init=" << toNumStr(scan.init())
                << " -D " << binOpName<op>()
                << " -D CPLX=" << af::iscplx<Ti>()
                << " -D isFinalPass=" << isFinalPass
                << " -D inclusive_scan=" << inclusive_scan;
        if (std::is_same<Ti, double>::value ||
            std::is_same<Ti, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {ops_cl, scan_dim_cl};
        const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
        cl::Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());

        entry.prog = new Program(prog);
        entry.ker = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "scan_dim_kernel");
        entry.ker[1] = Kernel(*entry.prog, "bcast_dim_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    return entry.ker[kerIdx];
}
To reduce_all(const Array<Ti> &in)
{
    Transform<Ti, To, op> transform;
    Binary<To, op> reduce;

    To out = reduce.init();

    // Walk the full 4D array via its strides and fold every element
    // into the accumulator
    af::dim4 dims = in.dims();
    af::dim4 strides = in.strides();
    const Ti *inPtr = in.get();

    for (dim_t l = 0; l < dims[3]; l++) {
        dim_t off3 = l * strides[3];
        for (dim_t k = 0; k < dims[2]; k++) {
            dim_t off2 = k * strides[2];
            for (dim_t j = 0; j < dims[1]; j++) {
                dim_t off1 = j * strides[1];
                for (dim_t i = 0; i < dims[0]; i++) {
                    dim_t idx = i + off1 + off2 + off3;
                    To val = transform(inPtr[idx]);
                    out = reduce(val, out);
                }
            }
        }
    }

    return out;
}
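The Transform/Binary pair above is the trait convention these functions rely on: Transform maps an input element to the accumulator type, while Binary supplies the identity value and the combine operation. A minimal standalone sketch of the same init/transform/fold pattern on a strided buffer; the TransformSum, BinarySum, and reduce_all_1d names here are hypothetical illustrations, not the library's types:

#include <cstddef>

// Hypothetical stand-ins for the Transform/Binary trait pair.
struct TransformSum {
    double operator()(float in) const { return double(in); }
};
struct BinarySum {
    double init() const { return 0.0; } // identity element for addition
    double operator()(double a, double b) const { return a + b; }
};

// Strided full reduction over a 1D buffer, mirroring the innermost loop above.
double reduce_all_1d(const float *in, size_t n, size_t stride) {
    TransformSum transform;
    BinarySum reduce;
    double out = reduce.init();
    for (size_t i = 0; i < n; i++)
        out = reduce(transform(in[i * stride]), out);
    return out;
}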
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
                const Ti *in, const dim4 &istrides, const dim4 &idims,
                const int dim)
{
    dim_t stride = istrides[dim];
    To out_val = reduce.init();
    for (dim_t i = 0; i < idims[dim]; i++) {
        To in_val = transform(in[i * stride]);
        out_val = reduce(in_val, out_val);
    }
    *out = out_val;
}
void operator()(To *out, const dim4 &ostrides, const dim4 &odims,
                const Ti *in, const dim4 &istrides, const dim4 &idims,
                const int dim, bool change_nan, double nanval)
{
    dim_t stride = istrides[dim];
    To out_val = reduce.init();
    for (dim_t i = 0; i < idims[dim]; i++) {
        To in_val = transform(in[i * stride]);
        if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
        out_val = reduce(in_val, out_val);
    }
    *out = out_val;
}
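A minimal sketch of the change_nan path above, substituting std::isnan for the library's IS_NAN macro; the function name and signature are illustrative only:

#include <cmath>
#include <cstddef>

// NaN inputs are replaced by a caller-supplied value before being folded
// into the accumulator, matching the conditional in the operator() above.
double sum_with_nan_substitution(const double *in, size_t n, size_t stride,
                                 bool change_nan, double nanval) {
    double out = 0.0;
    for (size_t i = 0; i < n; i++) {
        double v = in[i * stride];
        if (change_nan && std::isnan(v)) v = nanval;
        out += v;
    }
    return out;
}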
static Kernel* get_scan_dim_kernels(int kerIdx)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> scanProgs;
        static std::map<int, Kernel*> scanKerns;
        static std::map<int, Kernel*> bcastKerns;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device] () {
            Binary<To, op> scan;
            ToNum<To> toNum;

            std::ostringstream options;
            options << " -D To=" << dtype_traits<To>::getName()
                    << " -D Ti=" << dtype_traits<Ti>::getName()
                    << " -D T=To"
                    << " -D dim=" << dim
                    << " -D DIMY=" << threads_y
                    << " -D THREADS_X=" << THREADS_X
                    << " -D init=" << toNum(scan.init())
                    << " -D " << binOpName<op>()
                    << " -D CPLX=" << af::iscplx<Ti>()
                    << " -D isFinalPass=" << isFinalPass;
            if (std::is_same<Ti, double>::value ||
                std::is_same<Ti, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            const char *ker_strs[] = {ops_cl, scan_dim_cl};
            const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
            cl::Program prog;
            buildProgram(prog, 2, ker_strs, ker_lens, options.str());
            scanProgs[device] = new Program(prog);

            scanKerns[device] = new Kernel(*scanProgs[device], "scan_dim_kernel");
            bcastKerns[device] = new Kernel(*scanProgs[device], "bcast_dim_kernel");
        });

        return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void operator()(To *out, const dim4 ostrides, const dim4 odims,
                const Ti *in, const dim4 istrides, const dim4 idims,
                const int dim)
{
    dim_type istride = istrides[dim];
    dim_type ostride = ostrides[dim];

    Transform<Ti, To, op> transform;
    // FIXME: Change the name to something better
    Binary<To, op> scan;

    To out_val = scan.init();
    for (dim_type i = 0; i < idims[dim]; i++) {
        To in_val = transform(in[i * istride]);
        out_val = scan(in_val, out_val);
        out[i * ostride] = out_val;
    }
}
static Kernel get_scan_dim_kernels(int kerIdx)
{
    static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
    static Program scanProgs[DeviceManager::MAX_DEVICES];
    static Kernel scanKerns[DeviceManager::MAX_DEVICES];
    static Kernel bcastKerns[DeviceManager::MAX_DEVICES];

    int device = getActiveDeviceId();

    std::call_once(compileFlags[device], [device] () {
        Binary<To, op> scan;
        ToNum<To> toNum;

        std::ostringstream options;
        options << " -D To=" << dtype_traits<To>::getName()
                << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D T=To"
                << " -D dim=" << dim
                << " -D DIMY=" << threads_y
                << " -D THREADS_X=" << THREADS_X
                << " -D init=" << toNum(scan.init())
                << " -D " << binOpName<op>()
                << " -D CPLX=" << af::iscplx<Ti>()
                << " -D isFinalPass=" << isFinalPass;
        if (std::is_same<Ti, double>::value ||
            std::is_same<Ti, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {ops_cl, scan_dim_cl};
        const int ker_lens[] = {ops_cl_len, scan_dim_cl_len};
        buildProgram(scanProgs[device], 2, ker_strs, ker_lens, options.str());

        scanKerns[device] = Kernel(scanProgs[device], "scan_dim_kernel");
        bcastKerns[device] = Kernel(scanProgs[device], "bcast_dim_kernel");
    });

    return (kerIdx == 0) ? scanKerns[device] : bcastKerns[device];
}
__global__
static void reduce_dim_kernel(Param<To> out, CParam<Ti> in,
                              uint blocks_x, uint blocks_y, uint offset_dim,
                              bool change_nan, To nanval)
{
    const uint tidx = threadIdx.x;
    const uint tidy = threadIdx.y;
    const uint tid  = tidy * THREADS_X + tidx;

    const uint zid        = blockIdx.x / blocks_x;
    const uint wid        = blockIdx.y / blocks_y;
    const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const uint xid        = blockIdx_x * blockDim.x + tidx;
    const uint yid        = blockIdx_y; // yid of output. updated for input later.

    uint ids[4] = {xid, yid, zid, wid};

    const Ti *iptr = in.ptr;
    To *optr = out.ptr;

    // There is only one element per block for out
    // There are blockDim.y elements per block for in
    // Hence increment ids[dim] just after offseting out and before offsetting in
    optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] +
            ids[1] * out.strides[1] + ids[0];

    const uint blockIdx_dim = ids[dim];
    ids[dim] = ids[dim] * blockDim.y + tidy;

    iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] +
            ids[1] * in.strides[1] + ids[0];

    const uint id_dim_in   = ids[dim];
    const uint istride_dim = in.strides[dim];

    bool is_valid = (ids[0] < in.dims[0]) && (ids[1] < in.dims[1]) &&
                    (ids[2] < in.dims[2]) && (ids[3] < in.dims[3]);

    Transform<Ti, To, op> transform;
    Binary<To, op> reduce;

    __shared__ To s_val[THREADS_X * DIMY];

    To out_val = reduce.init();
    for (int id = id_dim_in; is_valid && (id < in.dims[dim]);
         id += offset_dim * blockDim.y) {
        To in_val = transform(*iptr);
        if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval;
        out_val = reduce(in_val, out_val);
        iptr = iptr + offset_dim * blockDim.y * istride_dim;
    }

    s_val[tid] = out_val;

    To *s_ptr = s_val + tid;
    __syncthreads();

    if (DIMY == 8) {
        if (tidy < 4) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 4]);
        __syncthreads();
    }

    if (DIMY >= 4) {
        if (tidy < 2) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 2]);
        __syncthreads();
    }

    if (DIMY >= 2) {
        if (tidy < 1) *s_ptr = reduce(*s_ptr, s_ptr[THREADS_X * 1]);
        __syncthreads();
    }

    if (tidy == 0 && is_valid && (blockIdx_dim < out.dims[dim])) {
        *optr = *s_ptr;
    }
}
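The unrolled DIMY == 8 / >= 4 / >= 2 cascade above is a shared-memory tree reduction: each step halves the active rows and folds the upper half into the lower. A serial sketch of the same fold, assuming a power-of-two n; the tree_reduce name is illustrative:

// Serial shape of the shared-memory fold: repeatedly halve the active
// region, combining element i with its partner at i + half.
void tree_reduce(float *buf, int n) { // n must be a power of two
    for (int half = n / 2; half >= 1; half /= 2)
        for (int i = 0; i < half; i++)
            buf[i] += buf[i + half];
    // buf[0] now holds the reduction of all n elements
}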
__global__
static void scan_dim_nonfinal_kernel(Param<To> out, Param<To> tmp,
                                     Param<char> tflg, Param<int> tlid,
                                     CParam<Ti> in, CParam<Tk> key, int dim,
                                     uint blocks_x, uint blocks_y, uint lim,
                                     bool inclusive_scan)
{
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;
    const int tid  = tidy * THREADS_X + tidx;

    const int zid        = blockIdx.x / blocks_x;
    const int wid        = blockIdx.y / blocks_y;
    const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const int xid        = blockIdx_x * blockDim.x + tidx;
    const int yid        = blockIdx_y; // yid of output. updated for input later.

    int ids[4] = {xid, yid, zid, wid};

    const Ti *iptr = in.ptr;
    const Tk *kptr = key.ptr;
    To *optr       = out.ptr;
    To *tptr       = tmp.ptr;
    char *tfptr    = tflg.ptr;
    int *tiptr     = tlid.ptr;

    // There is only one element per block for out
    // There are blockDim.y elements per block for in
    // Hence increment ids[dim] just after offseting out and before offsetting in
    tptr  += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] +
             ids[1] * tmp.strides[1] + ids[0];
    tfptr += ids[3] * tflg.strides[3] + ids[2] * tflg.strides[2] +
             ids[1] * tflg.strides[1] + ids[0];
    tiptr += ids[3] * tlid.strides[3] + ids[2] * tlid.strides[2] +
             ids[1] * tlid.strides[1] + ids[0];
    const int blockIdx_dim = ids[dim];

    ids[dim] = ids[dim] * blockDim.y * lim + tidy;
    optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] +
            ids[1] * out.strides[1] + ids[0];
    iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] +
            ids[1] * in.strides[1] + ids[0];
    kptr += ids[3] * key.strides[3] + ids[2] * key.strides[2] +
            ids[1] * key.strides[1] + ids[0];
    int id_dim        = ids[dim];
    const int out_dim = out.dims[dim];

    bool is_valid = (ids[0] < out.dims[0]) && (ids[1] < out.dims[1]) &&
                    (ids[2] < out.dims[2]) && (ids[3] < out.dims[3]);

    const int ostride_dim = out.strides[dim];
    const int istride_dim = in.strides[dim];

    __shared__ char s_flg[THREADS_X * DIMY * 2];
    __shared__ To s_val[THREADS_X * DIMY * 2];
    __shared__ char s_ftmp[THREADS_X];
    __shared__ To s_tmp[THREADS_X];
    __shared__ int boundaryid[THREADS_X];

    To *sptr    = s_val + tid;
    char *sfptr = s_flg + tid;

    Transform<Ti, To, op> transform;
    Binary<To, op> binop;
    const To init = binop.init();
    To val = init;

    const bool isLast = (tidy == (DIMY - 1));
    if (isLast) {
        s_tmp[tidx]      = val;
        s_ftmp[tidx]     = 0;
        boundaryid[tidx] = -1;
    }
    __syncthreads();

    char flag = 0;
    for (int k = 0; k < lim; k++) {
        if (id_dim < out_dim) {
            flag = calculate_head_flags_dim(kptr, id_dim, key.strides[dim]);
        } else {
            flag = 0;
        }

        // Load val from global in
        if (inclusive_scan) {
            if (id_dim >= out_dim) {
                val = init;
            } else {
                val = transform(*iptr);
            }
        } else {
            if ((id_dim == 0) || (id_dim >= out_dim) || flag) {
                val = init;
            } else {
                val = transform(*(iptr - istride_dim));
            }
        }

        // Add partial result from last iteration before scan operation
        if ((tidy == 0) && (flag == 0)) {
            val  = binop(val, s_tmp[tidx]);
            flag = s_ftmp[tidx];
        }

        // Write to shared memory
        *sptr  = val;
        *sfptr = flag;
        __syncthreads();

        // Segmented Scan
        int start = 0;
#pragma unroll
        for (int off = 1; off < DIMY; off *= 2) {
            if (tidy >= off) {
                val = sfptr[start * THREADS_X]
                          ? val
                          : binop(val, sptr[(start - off) * THREADS_X]);
                flag = sfptr[start * THREADS_X] |
                       sfptr[(start - off) * THREADS_X];
            }
            start = DIMY - start;
            sptr[start * THREADS_X]  = val;
            sfptr[start * THREADS_X] = flag;
            __syncthreads();
        }

        // Identify segment boundary
        if (tidy == 0) {
            if ((s_ftmp[tidx] == 0) && (sfptr[start * THREADS_X] == 1)) {
                boundaryid[tidx] = id_dim;
            }
        } else {
            if ((sfptr[(start - 1) * THREADS_X] == 0) &&
                (sfptr[start * THREADS_X] == 1)) {
                boundaryid[tidx] = id_dim;
            }
        }
        __syncthreads();

        if (is_valid && (id_dim < out_dim)) *optr = val;
        if (isLast) {
            s_tmp[tidx]  = val;
            s_ftmp[tidx] = flag;
        }

        id_dim += blockDim.y;
        kptr += blockDim.y * key.strides[dim];
        iptr += blockDim.y * istride_dim;
        optr += blockDim.y * ostride_dim;
        __syncthreads();
    }

    if (is_valid && (blockIdx_dim < tmp.dims[dim]) && isLast) {
        *tptr  = val;
        *tfptr = flag;
        int boundary = boundaryid[tidx];
        *tiptr = (boundary == -1) ? id_dim : boundary;
    }
}
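The segmented (by-key) scan above obeys a simple rule: a set head flag starts a new segment and resets the running value, otherwise the previous partial result is folded in. A sequential reference for integer addition; this is a sketch, not the kernel's code path:

#include <cstddef>
#include <vector>

std::vector<int> segmented_inclusive_sum(const std::vector<int> &vals,
                                         const std::vector<char> &flags) {
    std::vector<int> out(vals.size());
    int acc = 0; // binop.init() for addition
    for (size_t i = 0; i < vals.size(); i++) {
        acc    = flags[i] ? vals[i] : acc + vals[i]; // flag resets the segment
        out[i] = acc;
    }
    return out;
}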
__global__
static void scan_dim_kernel(Param<To> out, Param<To> tmp, CParam<Ti> in,
                            uint blocks_x, uint blocks_y, uint blocks_dim,
                            uint lim)
{
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;
    const int tid  = tidy * THREADS_X + tidx;

    const int zid        = blockIdx.x / blocks_x;
    const int wid        = blockIdx.y / blocks_y;
    const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const int xid        = blockIdx_x * blockDim.x + tidx;
    const int yid        = blockIdx_y; // yid of output. updated for input later.

    int ids[4] = {xid, yid, zid, wid};

    const Ti *iptr = in.ptr;
    To *optr       = out.ptr;
    To *tptr       = tmp.ptr;

    // There is only one element per block for out
    // There are blockDim.y elements per block for in
    // Hence increment ids[dim] just after offseting out and before offsetting in
    tptr += ids[3] * tmp.strides[3] + ids[2] * tmp.strides[2] +
            ids[1] * tmp.strides[1] + ids[0];
    const int blockIdx_dim = ids[dim];

    ids[dim] = ids[dim] * blockDim.y * lim + tidy;
    optr += ids[3] * out.strides[3] + ids[2] * out.strides[2] +
            ids[1] * out.strides[1] + ids[0];
    iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] +
            ids[1] * in.strides[1] + ids[0];
    int id_dim        = ids[dim];
    const int out_dim = out.dims[dim];

    bool is_valid = (ids[0] < out.dims[0]) && (ids[1] < out.dims[1]) &&
                    (ids[2] < out.dims[2]) && (ids[3] < out.dims[3]);

    const int ostride_dim = out.strides[dim];
    const int istride_dim = in.strides[dim];

    __shared__ To s_val[THREADS_X * DIMY * 2];
    __shared__ To s_tmp[THREADS_X];
    To *sptr = s_val + tid;

    Transform<Ti, To, op> transform;
    Binary<To, op> binop;
    const To init = binop.init();
    To val = init;

    const bool isLast = (tidy == (DIMY - 1));

    for (int k = 0; k < lim; k++) {
        if (isLast) s_tmp[tidx] = val;

        bool cond = (is_valid) && (id_dim < out_dim);
        val = cond ? transform(*iptr) : init;
        *sptr = val;
        __syncthreads();

        int start = 0;
#pragma unroll
        for (int off = 1; off < DIMY; off *= 2) {
            if (tidy >= off) val = binop(val, sptr[(start - off) * THREADS_X]);
            start = DIMY - start;
            sptr[start * THREADS_X] = val;
            __syncthreads();
        }

        val = binop(val, s_tmp[tidx]);
        __syncthreads();

        if (cond) *optr = val;

        id_dim += blockDim.y;
        iptr += blockDim.y * istride_dim;
        optr += blockDim.y * ostride_dim;
    }

    if (!isFinalPass && is_valid && (blockIdx_dim < tmp.dims[dim]) && isLast) {
        *tptr = val;
    }
}
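The tmp writes when !isFinalPass are the first half of a standard two-pass block scan: pass one scans each chunk and records its total, the totals are themselves scanned, and a second ("bcast") pass adds each chunk's exclusive prefix back in. A sequential sketch of the scheme; two_pass_scan is a hypothetical helper, not the library's API:

#include <algorithm>
#include <cstddef>
#include <vector>

void two_pass_scan(std::vector<int> &data, size_t chunk) {
    std::vector<int> totals;
    for (size_t base = 0; base < data.size(); base += chunk) {
        int acc = 0;
        for (size_t i = base; i < std::min(base + chunk, data.size()); i++) {
            acc += data[i];      // pass 1: per-chunk inclusive scan
            data[i] = acc;
        }
        totals.push_back(acc);   // per-chunk total, i.e. the tmp array
    }
    int offset = 0;
    for (size_t c = 0, base = 0; base < data.size(); c++, base += chunk) {
        for (size_t i = base; i < std::min(base + chunk, data.size()); i++)
            data[i] += offset;   // pass 2: broadcast exclusive chunk prefix
        offset += totals[c];
    }
}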
__global__
static void ireduce_dim_kernel(Param<T> out, uint *olptr, CParam<T> in,
                               const uint *ilptr, uint blocks_x,
                               uint blocks_y, uint offset_dim)
{
    const uint tidx = threadIdx.x;
    const uint tidy = threadIdx.y;
    const uint tid  = tidy * THREADS_X + tidx;

    const uint zid        = blockIdx.x / blocks_x;
    const uint wid        = blockIdx.y / blocks_y;
    const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const uint xid        = blockIdx_x * blockDim.x + tidx;
    const uint yid        = blockIdx_y; // yid of output. updated for input later.

    uint ids[4] = {xid, yid, zid, wid};

    const T *iptr = in.ptr;
    T *optr = out.ptr;

    // There is only one element per block for out
    // There are blockDim.y elements per block for in
    // Hence increment ids[dim] just after offseting out and before offsetting in
    optr  += ids[3] * out.strides[3] + ids[2] * out.strides[2] +
             ids[1] * out.strides[1] + ids[0];
    olptr += ids[3] * out.strides[3] + ids[2] * out.strides[2] +
             ids[1] * out.strides[1] + ids[0];
    const uint blockIdx_dim = ids[dim];

    ids[dim] = ids[dim] * blockDim.y + tidy;
    iptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] +
            ids[1] * in.strides[1] + ids[0];
    if (!is_first)
        ilptr += ids[3] * in.strides[3] + ids[2] * in.strides[2] +
                 ids[1] * in.strides[1] + ids[0];

    const uint id_dim_in   = ids[dim];
    const uint istride_dim = in.strides[dim];

    bool is_valid = (ids[0] < in.dims[0]) && (ids[1] < in.dims[1]) &&
                    (ids[2] < in.dims[2]) && (ids[3] < in.dims[3]);

    Binary<T, op> ireduce;
    T val    = ireduce.init();
    uint idx = id_dim_in;

    if (is_valid && id_dim_in < in.dims[dim]) {
        val = *iptr;
        if (!is_first) idx = *ilptr;
    }

    MinMaxOp<op, T> Op(val, idx);
    const uint id_dim_in_start = id_dim_in + offset_dim * blockDim.y;

    __shared__ T s_val[THREADS_X * DIMY];
    __shared__ uint s_idx[THREADS_X * DIMY];

    for (int id = id_dim_in_start; is_valid && (id < in.dims[dim]);
         id += offset_dim * blockDim.y) {
        iptr = iptr + offset_dim * blockDim.y * istride_dim;
        if (!is_first) {
            ilptr = ilptr + offset_dim * blockDim.y * istride_dim;
            Op(*iptr, *ilptr);
        } else {
            Op(*iptr, id);
        }
    }

    s_val[tid] = Op.m_val;
    s_idx[tid] = Op.m_idx;

    T *s_vptr    = s_val + tid;
    uint *s_iptr = s_idx + tid;
    __syncthreads();

    if (DIMY == 8) {
        if (tidy < 4) {
            Op(s_vptr[THREADS_X * 4], s_iptr[THREADS_X * 4]);
            *s_vptr = Op.m_val;
            *s_iptr = Op.m_idx;
        }
        __syncthreads();
    }

    if (DIMY >= 4) {
        if (tidy < 2) {
            Op(s_vptr[THREADS_X * 2], s_iptr[THREADS_X * 2]);
            *s_vptr = Op.m_val;
            *s_iptr = Op.m_idx;
        }
        __syncthreads();
    }

    if (DIMY >= 2) {
        if (tidy < 1) {
            Op(s_vptr[THREADS_X * 1], s_iptr[THREADS_X * 1]);
            *s_vptr = Op.m_val;
            *s_iptr = Op.m_idx;
        }
        __syncthreads();
    }

    if (tidy == 0 && is_valid && (blockIdx_dim < out.dims[dim])) {
        *optr  = *s_vptr;
        *olptr = *s_iptr;
    }
}
__global__
static void ireduce_first_kernel(Param<T> out, uint *olptr, CParam<T> in,
                                 const uint *ilptr, uint blocks_x,
                                 uint blocks_y, uint repeat)
{
    const uint tidx = threadIdx.x;
    const uint tidy = threadIdx.y;
    const uint tid  = tidy * blockDim.x + tidx;

    const uint zid        = blockIdx.x / blocks_x;
    const uint wid        = blockIdx.y / blocks_y;
    const uint blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const uint blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const uint xid        = blockIdx_x * blockDim.x * repeat + tidx;
    const uint yid        = blockIdx_y * blockDim.y + tidy;

    const T *iptr = in.ptr;
    T *optr = out.ptr;

    iptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
    optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];

    if (!is_first)
        ilptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
    olptr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];

    if (yid >= in.dims[1] || zid >= in.dims[2] || wid >= in.dims[3]) return;

    int lim = min((int)(xid + repeat * DIMX), in.dims[0]);

    Binary<T, op> ireduce;
    T val    = ireduce.init();
    uint idx = xid;

    if (xid < lim) {
        val = iptr[xid];
        if (!is_first) idx = ilptr[xid];
    }

    MinMaxOp<op, T> Op(val, idx);

    __shared__ T s_val[THREADS_PER_BLOCK];
    __shared__ uint s_idx[THREADS_PER_BLOCK];

    for (int id = xid + DIMX; id < lim; id += DIMX) {
        Op(iptr[id], (!is_first) ? ilptr[id] : id);
    }

    s_val[tid] = Op.m_val;
    s_idx[tid] = Op.m_idx;
    __syncthreads();

    T *s_vptr    = s_val + tidy * DIMX;
    uint *s_iptr = s_idx + tidy * DIMX;

    if (DIMX == 256) {
        if (tidx < 128) {
            Op(s_vptr[tidx + 128], s_iptr[tidx + 128]);
            s_vptr[tidx] = Op.m_val;
            s_iptr[tidx] = Op.m_idx;
        }
        __syncthreads();
    }

    if (DIMX >= 128) {
        if (tidx < 64) {
            Op(s_vptr[tidx + 64], s_iptr[tidx + 64]);
            s_vptr[tidx] = Op.m_val;
            s_iptr[tidx] = Op.m_idx;
        }
        __syncthreads();
    }

    if (DIMX >= 64) {
        if (tidx < 32) {
            Op(s_vptr[tidx + 32], s_iptr[tidx + 32]);
            s_vptr[tidx] = Op.m_val;
            s_iptr[tidx] = Op.m_idx;
        }
        __syncthreads();
    }

    warp_reduce<T, op>(s_vptr, s_iptr, tidx);

    if (tidx == 0) {
        optr[blockIdx_x] = s_vptr[0];
        olptr[blockIdx_x] = s_iptr[0];
    }
}
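MinMaxOp carries the current best value together with the linear index it came from, so argmin/argmax fall out of the same tree reduction as min/max. An illustrative stand-in for the min case; this is a sketch of the idea, not the library's definition:

struct MinOp {
    float m_val;
    unsigned m_idx;
    MinOp(float v, unsigned i) : m_val(v), m_idx(i) {}
    void operator()(float v, unsigned i) {
        // keep the smaller value and remember where it came from
        if (v < m_val) { m_val = v; m_idx = i; }
    }
};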
void mean_first_launcher(Param out, Param owt, Param in, Param inWeight,
                         const int threads_x, const uint groups_x,
                         const uint groups_y)
{
    bool input_weight  = ((inWeight.info.dims[0] * inWeight.info.dims[1] *
                           inWeight.info.dims[2] * inWeight.info.dims[3]) != 0);
    bool output_weight = ((owt.info.dims[0] * owt.info.dims[1] *
                           owt.info.dims[2] * owt.info.dims[3]) != 0);

    std::string ref_name = std::string("mean_0_") +
        std::string(dtype_traits<Ti>::getName()) + std::string("_") +
        std::string(dtype_traits<Tw>::getName()) + std::string("_") +
        std::string(dtype_traits<To>::getName()) + std::string("_") +
        std::to_string(threads_x) + std::string("_") +
        std::to_string(input_weight) + std::string("_") +
        std::to_string(output_weight);

    int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        Binary<To, af_add_t> mean;
        ToNumStr<To> toNumStr;
        ToNumStr<Tw> twNumStr;
        Transform<uint, Tw, af_add_t> transform_weight;

        std::ostringstream options;
        options << " -D Ti=" << dtype_traits<Ti>::getName()
                << " -D Tw=" << dtype_traits<Tw>::getName()
                << " -D To=" << dtype_traits<To>::getName()
                << " -D DIMX=" << threads_x
                << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP
                << " -D init_To=" << toNumStr(mean.init())
                << " -D init_Tw=" << twNumStr(transform_weight(0))
                << " -D one_Tw=" << twNumStr(transform_weight(1));
        if (input_weight) { options << " -D INPUT_WEIGHT"; }
        if (output_weight) { options << " -D OUTPUT_WEIGHT"; }
        if (std::is_same<Ti, double>::value ||
            std::is_same<Ti, cdouble>::value ||
            std::is_same<To, double>::value) {
            options << " -D USE_DOUBLE";
        }

        const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
        const int ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());

        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "mean_first_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
                   groups_y * in.info.dims[3] * local[1]);

    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

    if (input_weight && output_weight) {
        auto meanOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    Buffer, KParam, Buffer, KParam,
                                    uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *owt.data, owt.info,
               *in.data, in.info, *inWeight.data, inWeight.info,
               groups_x, groups_y, repeat);
    } else if (!input_weight && !output_weight) {
        auto meanOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *in.data, in.info,
               groups_x, groups_y, repeat);
    } else if (input_weight && !output_weight) {
        auto meanOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    Buffer, KParam, uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *in.data, in.info,
               *inWeight.data, inWeight.info,
               groups_x, groups_y, repeat);
    } else if (!input_weight && output_weight) {
        auto meanOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                    Buffer, KParam, uint, uint, uint>(*entry.ker);
        meanOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *owt.data, owt.info,
               *in.data, in.info,
               groups_x, groups_y, repeat);
    }
    CL_DEBUG_FINISH(getQueue());
}
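The mean kernels this launcher dispatches fold (value, weight) pairs, and two partial means combine as a weight-proportional blend: mean_new = mean + (m - mean) * w / (W + w). A sketch of that update rule under the stated assumption; the MeanAcc type is illustrative, not the kernel source:

struct MeanAcc {
    double mean = 0.0, weight = 0.0;
    void combine(double m, double w) {
        weight += w;
        // blend toward the incoming partial mean in proportion to its weight;
        // algebraically equal to (mean*W + m*w) / (W + w), but numerically stabler
        if (weight > 0.0) mean += (m - mean) * (w / weight);
    }
};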
__global__
static void scan_first_kernel(Param<To> out, Param<To> tmp, CParam<Ti> in,
                              uint blocks_x, uint blocks_y, uint lim)
{
    const int tidx = threadIdx.x;
    const int tidy = threadIdx.y;

    const int zid        = blockIdx.x / blocks_x;
    const int wid        = blockIdx.y / blocks_y;
    const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
    const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
    const int xid        = blockIdx_x * blockDim.x * lim + tidx;
    const int yid        = blockIdx_y * blockDim.y + tidy;

    bool cond_yzw = (yid < out.dims[1]) && (zid < out.dims[2]) &&
                    (wid < out.dims[3]);

    if (!cond_yzw) return; // retire warps early

    const Ti *iptr = in.ptr;
    To *optr       = out.ptr;
    To *tptr       = tmp.ptr;

    iptr += wid * in.strides[3] + zid * in.strides[2] + yid * in.strides[1];
    optr += wid * out.strides[3] + zid * out.strides[2] + yid * out.strides[1];
    tptr += wid * tmp.strides[3] + zid * tmp.strides[2] + yid * tmp.strides[1];

    const int DIMY = THREADS_PER_BLOCK / DIMX;
    const int SHARED_MEM_SIZE = (2 * DIMX + 1) * (DIMY);
    __shared__ To s_val[SHARED_MEM_SIZE];
    __shared__ To s_tmp[DIMY];
    To *sptr = s_val + tidy * (2 * DIMX + 1);

    Transform<Ti, To, op> transform;
    Binary<To, op> binop;
    const To init = binop.init();

    int id = xid;
    To val = init;

    const bool isLast = (tidx == (DIMX - 1));

    for (int k = 0; k < lim; k++) {
        if (isLast) s_tmp[tidy] = val;

        bool cond = (id < out.dims[0]);
        val = cond ? transform(iptr[id]) : init;
        sptr[tidx] = val;
        __syncthreads();

        int start = 0;
#pragma unroll
        for (int off = 1; off < DIMX; off *= 2) {
            if (tidx >= off) val = binop(val, sptr[(start - off) + tidx]);
            start = DIMX - start;
            sptr[start + tidx] = val;
            __syncthreads();
        }

        val = binop(val, s_tmp[tidy]);
        if (cond) optr[id] = val;

        id += blockDim.x;
        __syncthreads();
    }

    if (!isFinalPass && isLast) {
        tptr[blockIdx_x] = val;
    }
}