template<typename T, ReduceOperator OP = RO_DUMMY>
clsparseStatus
atomic_reduce(clsparse::array_base<T>& pR,
              const clsparse::array_base<T>& pX,
              const cl_ulong wg_size,
              const clsparseControl control)
{
    // The partial-results vector must fit in exactly one work-group.
    assert(wg_size == pX.size());

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -D" + ReduceOperatorTrait<OP>::operation;

    // Pick the atomics flavour matching the value type.
    if (typeid(cl_float) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_FLOAT";
        params.append(options);
    }
    else if (typeid(cl_double) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_DOUBLE";
        params.append(options);
    }
    else if (typeid(cl_int) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_INT";
        params.append(options);
    }
    else
    {
        return clsparseInvalidType;
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "atomic_reduce", "reduce_block",
                                         params);

    KernelWrap kWrapper(kernel);

    kWrapper << pR.data();
    kWrapper << pX.data();

    // Since wg_size == pX.size(), this launches a single work-group.
    int blocksNum = (pX.size() + wg_size - 1) / wg_size;
    int globalSize = blocksNum * wg_size;

    cl::NDRange local(wg_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
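/*
 * Usage sketch (illustration only, not part of the library): atomic_reduce
 * expects pX to hold exactly wg_size partial results, so the launch covers a
 * single work-group and the ATOMIC_* define selects the atomics flavour for T.
 * A hypothetical call reducing 256 partial sums, assuming RO_PLUS is one of
 * the ReduceOperator enumerators:
 *
 *     clsparse::vector<cl_float> r(control, 1, 0, CL_MEM_READ_WRITE, false);
 *     clsparse::vector<cl_float> partial(control, 256, 0, CL_MEM_READ_WRITE, false);
 *     // ... fill `partial` with per-block results ...
 *     clsparseStatus s = atomic_reduce<cl_float, RO_PLUS>(r, partial, 256, control);
 */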
template<typename T>
clsparseStatus
dot(clsparse::array_base<T>& pR,
    const clsparse::array_base<T>& pX,
    const clsparse::array_base<T>& pY,
    const clsparseControl control)
{
    clsparseStatus status;

    // Not strictly necessary, but remember to initialise pR with a proper value.
    init_scalar(pR, (T)0, control);

    // With REDUCE_BLOCKS_NUMBER = 256 the final reduction can be performed
    // within one block.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation:
    // work-groups per compute unit
    const cl_uint WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();

    assert(xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        // Partial results: one value per reduce block.
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                    CL_MEM_READ_WRITE, false);

        status = inner_product<T>(partial, pX, pY, size,
                                  REDUCE_BLOCKS_NUMBER, REDUCE_BLOCK_SIZE,
                                  control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
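/*
 * Sketch of the two-stage reduction behind dot() (illustration only):
 * inner_product() writes one partial dot product per block into `partial`
 * (REDUCE_BLOCKS_NUMBER == 256 values), then atomic_reduce() collapses those
 * 256 partials with a single 256-wide work-group. A hypothetical host-side
 * call, reusing the clsparse::vector constructor seen above:
 *
 *     clsparse::vector<cl_float> x(control, 1024, 1, CL_MEM_READ_WRITE, false);
 *     clsparse::vector<cl_float> y(control, 1024, 2, CL_MEM_READ_WRITE, false);
 *     clsparse::vector<cl_float> r(control, 1, 0, CL_MEM_READ_WRITE, false);
 *     dot<cl_float>(r, x, y, control);  // r[0] == 1024 * (1 * 2) == 2048
 */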
template<typename T, ElementWiseOperator OP = EW_PLUS>
clsparseStatus
axpby(clsparse::array_base<T>& pY,
      const clsparse::array_base<T>& pAlpha,
      const clsparse::array_base<T>& pX,
      const clsparse::array_base<T>& pBeta,
      const clsparse::array_base<T>& pZ,
      const clsparseControl control)
{
    const int group_size = 256; // this or higher? control->max_wg_size?

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpby", params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pY.size();

    // clSPARSE does not support offsets yet.
    cl_ulong offset = 0;

    kWrapper << size
             << pY.data() << offset
             << pAlpha.data() << offset
             << pX.data() << offset
             << pBeta.data() << offset
             << pZ.data() << offset;

    // Round the global size up to a multiple of the work-group size.
    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
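/*
 * Usage sketch (illustration only): axpby computes, elementwise,
 * pY = pAlpha * pX  OP  pBeta * pZ, with OP baked into the kernel via
 * ElementWiseOperatorTrait. Given 1000-element vectors y, x, z and one-element
 * vectors alpha (== 2) and beta (== 3), and assuming EW_PLUS is an
 * ElementWiseOperator enumerator:
 *
 *     // y = 2*x + 3*z over 1000 elements
 *     axpby<cl_float, EW_PLUS>(y, alpha, x, beta, z, control);
 *
 * With size == 1000 and group_size == 256, blocksNum == 4, so 1024 work-items
 * are launched and the kernel must guard against ids >= size.
 */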
template<typename T>
clsparseStatus
scale(clsparse::array_base<T>& pVector,
      const clsparse::array_base<T>& pAlpha,
      clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pVector.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pVector.data() << offset
             << pAlpha.data() << offset;

    // Round the global size up to a multiple of the work-group size.
    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
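/*
 * Usage sketch (illustration only): this overload scales in place,
 * v = alpha * v, given a vector v and a one-element vector alpha:
 *
 *     scale<cl_double>(v, alpha, control);
 *
 * Note it always builds the kernel with SIZE_TYPE == cl_ulong, whereas the
 * out-of-place overload below picks SIZE_TYPE to match clsparseIdx_t.
 */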
template<typename T>
clsparseStatus
scale(clsparse::array_base<T>& pResult,
      const clsparse::array_base<T>& pAlpha,
      const clsparse::array_base<T>& pVector,
      clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    std::string params = std::string()
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    // Select the kernel's index type to match the width of clsparseIdx_t.
    if (sizeof(clsparseIdx_t) == 8)
    {
        std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    if (typeid(T) == typeid(cl_double))
    {
        params.append(" -DDOUBLE");
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel "
                         "on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params);

    KernelWrap kWrapper(kernel);

    clsparseIdx_t size = pResult.size();
    clsparseIdx_t offset = 0;

    kWrapper << size
             << pResult.data() << offset
             << pVector.data() << offset
             << pAlpha.data() << offset;

    // Round the global size up to a multiple of the work-group size.
    clsparseIdx_t blocksNum = (size + group_size - 1) / group_size;
    clsparseIdx_t globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
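/*
 * Usage sketch (illustration only): the out-of-place overload writes
 * pResult = pAlpha * pVector. For T == cl_double the kernel is built with
 * -DDOUBLE, and the call fails early with clsparseInvalidDevice on devices
 * without double-precision support:
 *
 *     clsparseStatus s = scale<cl_double>(result, alpha, v, control);
 *     if (s == clsparseInvalidDevice)
 *     {
 *         // fall back to single precision or report the error
 *     }
 */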