template <typename T>
clsparseStatus
dot(clsparse::array_base<T>& pR,
    const clsparse::array_base<T>& pX,
    const clsparse::array_base<T>& pY,
    const clsparseControl control)
{
    cl_int status;

    // not strictly necessary, but initialize pR to a well-defined value
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 the final reduction can be performed
    // within a single work-group
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation:
    // work-groups per compute unit
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();
    assert(xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        // partial result, one entry per reduction block
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                    CL_MEM_READ_WRITE, false);

        status = inner_product<T>(partial, pX, pY, size, REDUCE_BLOCKS_NUMBER,
                                  REDUCE_BLOCK_SIZE, control);
        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE, control);
        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
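// The two kernels invoked above (inner_product, atomic_reduce) are defined
// elsewhere in clSPARSE and are not shown in this file. The following
// host-side C++ sketch only illustrates the two-stage reduction pattern they
// presumably implement; dot_two_stage, blocks, and chunk are hypothetical
// names for this illustration, not part of the library.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
T dot_two_stage(const std::vector<T>& x, const std::vector<T>& y,
                std::size_t blocks = 256)
{
    assert(x.size() == y.size());
    const std::size_t n = x.size();

    // Stage 1 (inner_product): each block accumulates a partial inner
    // product over its own chunk -- one work-group per block on the device.
    std::vector<T> partial(blocks, T(0));
    const std::size_t chunk = (n + blocks - 1) / blocks;
    for (std::size_t b = 0; b < blocks; ++b)
    {
        const std::size_t begin = b * chunk;
        const std::size_t end   = std::min(n, begin + chunk);
        for (std::size_t i = begin; i < end; ++i)
            partial[b] += x[i] * y[i];
    }

    // Stage 2 (atomic_reduce): collapse the partial sums into one scalar.
    // Capping the block count at 256 is what lets this final reduction fit
    // in a single work-group on the device, avoiding a chain of launches.
    T result = T(0);
    for (std::size_t b = 0; b < blocks; ++b)
        result += partial[b];
    return result;
}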
static C init_scalar_periodic(
    int rnk_n, const INT *n, const INT *ind
    )
{
  INT *periodic_ind = PX(malloc_INT)(rnk_n);

  /* ensure periodicity in all directions */
  for (int t = 0; t < rnk_n; t++)
    periodic_ind[t] = ind[t] % n[t];

  C result = init_scalar(rnk_n, n, periodic_ind);

  free(periodic_ind);
  return result;
}
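// Note: the plain '%' above maps ind[t] into [0, n[t]) only when ind[t] is
// non-negative; in C and C++ the remainder of a negative operand is itself
// negative, which would yield an out-of-range index. Presumably callers here
// guarantee non-negative indices. A minimal sketch of a symmetric wrap that
// also handles negative indices (wrap_periodic is a hypothetical helper, not
// part of this codebase):
#include <cassert>

// Wraps ind into [0, n) for any sign of ind, assuming n > 0.
inline long wrap_periodic(long ind, long n)
{
    assert(n > 0);
    return ((ind % n) + n) % n;
}

// Usage: wrap_periodic(7, 5) == 2, same as 7 % 5;
//        wrap_periodic(-1, 5) == 4, whereas -1 % 5 == -1 in C/C++.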