// Second reduction stage: collapse the per-block partial results in pX into the
// scalar pR using the "reduce_block" kernel from the "atomic_reduce" program.
clsparseStatus
atomic_reduce(clsparse::array_base<T>& pR,
              const clsparse::array_base<T>& pX,
              const cl_ulong wg_size,
              const clsparseControl control)
{
    assert(wg_size == pX.size());

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -D" + ReduceOperatorTrait<OP>::operation;

    // Select the atomic flavour matching the value type; other types are not supported.
    if (typeid(cl_float) == typeid(T))
    {
        params.append(" -DATOMIC_FLOAT");
    }
    else if (typeid(cl_double) == typeid(T))
    {
        params.append(" -DATOMIC_DOUBLE");
    }
    else if (typeid(cl_int) == typeid(T))
    {
        params.append(" -DATOMIC_INT");
    }
    else
    {
        return clsparseInvalidType;
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "atomic_reduce", "reduce_block",
                                         params);

    KernelWrap kWrapper(kernel);
    kWrapper << pR.data();
    kWrapper << pX.data();

    // wg_size == pX.size(), so this launches a single work-group.
    int blocksNum = (pX.size() + wg_size - 1) / wg_size;
    int globalSize = blocksNum * wg_size;

    cl::NDRange local(wg_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
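/*
 * Hedged sketch of the "reduce_block" kernel compiled above, shown for the sum
 * operation selected by ReduceOperatorTrait<OP>: a single work-group of WG_SIZE
 * threads loads the partial results, performs a tree reduction in local memory,
 * and thread 0 accumulates the result into the output scalar. The ATOMIC_FLOAT
 * define is assumed to select a compare-and-swap emulation of a floating-point
 * atomic add (atomic_add_float below); all names except the -D macros passed
 * from the host are assumptions, not the clSPARSE kernel source.
 */
#ifdef ATOMIC_FLOAT
inline void atomic_add_float(volatile __global float* ptr, float val)
{
    union { unsigned int u; float f; } old_val, new_val;
    do
    {
        old_val.f = *ptr;
        new_val.f = old_val.f + val;
    } while (atomic_cmpxchg((volatile __global unsigned int*) ptr,
                            old_val.u, new_val.u) != old_val.u);
}
#endif

__kernel void reduce_block(__global VALUE_TYPE* result,
                           __global const VALUE_TYPE* partial)
{
    __local VALUE_TYPE scratch[WG_SIZE];

    const unsigned int lid = get_local_id(0);

    scratch[lid] = partial[lid];       // one partial value per work-item
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction within the single work-group
    for (unsigned int s = WG_SIZE / 2; s > 0; s >>= 1)
    {
        if (lid < s)
            scratch[lid] += scratch[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
#ifdef ATOMIC_FLOAT
        atomic_add_float(result, scratch[0]);
#else
        result[0] += scratch[0];       // single work-group launch: no race
#endif
}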
// Computes pR = <pX, pY> with a two-stage reduction: a per-block partial dot
// product (inner_product) followed by a single-block final reduction (atomic_reduce).
clsparseStatus
dot(clsparse::array_base<T>& pR,
    const clsparse::array_base<T>& pX,
    const clsparse::array_base<T>& pY,
    const clsparseControl control)
{
    clsparseStatus status;

    // Not strictly necessary, but remember to initialise pR with a proper value.
    init_scalar(pR, (T)0, control);

    // With REDUCE_BLOCKS_NUMBER = 256 the final reduction can be performed
    // within one block.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    // work-groups per compute unit
    const cl_uint WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();

    assert(xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        // partial result, one entry per reduction block
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                    CL_MEM_READ_WRITE, false);

        status = inner_product<T>(partial, pX, pY, size,
                                  REDUCE_BLOCKS_NUMBER, REDUCE_BLOCK_SIZE,
                                  control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
clsparseStatus
csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const clsparse::array_base<T>& pX,
                const clsparse::array_base<T>& pBeta,
                clsparse::array_base<T>& pY,
                clsparseControl control )
{
    const cl_uint group_size = 256;

    std::string params = std::string( )
            + " -DROWBITS=" + std::to_string( ROW_BITS )
            + " -DWGBITS=" + std::to_string( WG_BITS )
            + " -DBLOCKSIZE=" + std::to_string( BLKSIZE );

    if( typeid( T ) == typeid( cl_double ) )
    {
        params.append( " -DDOUBLE" );
    }

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    kWrapper << pCsrMatx->values
             << pCsrMatx->colIndices << pCsrMatx->rowOffsets
             << pX.data() << pY.data()
             << pCsrMatx->rowBlocks
             << pAlpha.data() << pBeta.data();
    //<< h_alpha << h_beta;

    // NVIDIA devices reject launches whose global work size is not a multiple
    // of the work-group size, so launch at least one full work-group; the
    // performance impact of this is unclear.
    cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
clsparseStatus
axpy(clsparse::array_base<T>& pY,
     const clsparse::array_base<T>& pAlpha,
     const clsparse::array_base<T>& pX,
     const clsparse::array_base<T>& pZ,
     const clsparseControl control)
{
    const int group_size = 256; // this or higher? control->max_wg_size?

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpy", params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pY.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pY.data() << offset
             << pAlpha.data() << offset
             << pX.data() << offset
             << pZ.data() << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
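/*
 * Hedged sketch of the "blas1"/"axpy" kernel launched above, shown for the
 * addition case of ElementWiseOperatorTrait<OP> (r = alpha * x + z); the real
 * clSPARSE kernel selects the operation via the -D define and may differ in
 * detail. The argument layout (each buffer followed by its offset) mirrors the
 * host-side kWrapper stream; the parameter names are assumptions.
 */
__kernel void axpy(SIZE_TYPE size,
                   __global VALUE_TYPE* r,           SIZE_TYPE r_offset,
                   __global const VALUE_TYPE* alpha, SIZE_TYPE alpha_offset,
                   __global const VALUE_TYPE* x,     SIZE_TYPE x_offset,
                   __global const VALUE_TYPE* z,     SIZE_TYPE z_offset)
{
    const SIZE_TYPE i = get_global_id(0);

    // The global size is rounded up to a multiple of WG_SIZE, so guard the tail.
    if (i >= size)
        return;

    r[r_offset + i] = alpha[alpha_offset] * x[x_offset + i] + z[z_offset + i];
}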
clsparseStatus
inner_product (clsparse::array_base<T>& partial,
               const clsparse::array_base<T>& pX,
               const clsparse::array_base<T>& pY,
               const cl_ulong size,
               const cl_ulong REDUCE_BLOCKS_NUMBER,
               const cl_ulong REDUCE_BLOCK_SIZE,
               const clsparseControl control)
{
    cl_ulong nthreads = REDUCE_BLOCK_SIZE * REDUCE_BLOCKS_NUMBER;

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DREDUCE_BLOCK_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DN_THREADS=" + std::to_string(nthreads);

    cl::Kernel kernel = KernelCache::get(control->queue, "dot", "inner_product", params);

    KernelWrap kWrapper(kernel);

    kWrapper << size
             << partial.data()
             << pX.data()
             << pY.data();

    cl::NDRange local(REDUCE_BLOCK_SIZE);
    cl::NDRange global(REDUCE_BLOCKS_NUMBER * REDUCE_BLOCK_SIZE);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
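/*
 * Hedged sketch of the first-stage "inner_product" kernel compiled above: each
 * work-group accumulates a grid-stride partial sum of x[i] * y[i], reduces it
 * in local memory, and emits one value per work-group into 'partial'. The macro
 * names mirror the -D defines passed from the host; the body itself is an
 * assumption, not the clSPARSE kernel source.
 */
__kernel void inner_product(const SIZE_TYPE size,
                            __global VALUE_TYPE* partial,
                            __global const VALUE_TYPE* x,
                            __global const VALUE_TYPE* y)
{
    __local VALUE_TYPE scratch[WG_SIZE];

    const unsigned int lid = get_local_id(0);

    // grid-stride loop: N_THREADS = WG_SIZE * number of reduction blocks
    VALUE_TYPE sum = (VALUE_TYPE) 0;
    for (SIZE_TYPE i = get_global_id(0); i < size; i += N_THREADS)
        sum += x[i] * y[i];

    scratch[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction within the work-group
    for (unsigned int s = WG_SIZE / 2; s > 0; s >>= 1)
    {
        if (lid < s)
            scratch[lid] += scratch[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        partial[get_group_id(0)] = scratch[0];
}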
clsparseStatus
scale(clsparse::array_base<T>& pVector,
      const clsparse::array_base<T>& pAlpha,
      clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pVector.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pVector.data() << offset
             << pAlpha.data() << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
clsparseStatus
scale( clsparse::array_base<T>& pResult,
       const clsparse::array_base<T>& pAlpha,
       const clsparse::array_base<T>& pVector,
       clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    std::string params = std::string()
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    if (sizeof(clsparseIdx_t) == 8)
    {
        std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    if (typeid(T) == typeid(cl_double))
    {
        params.append(" -DDOUBLE");
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params);

    KernelWrap kWrapper(kernel);

    clsparseIdx_t size = pResult.size();
    clsparseIdx_t offset = 0;

    kWrapper << size
             << pResult.data() << offset
             << pVector.data() << offset
             << pAlpha.data() << offset;

    clsparseIdx_t blocksNum = (size + group_size - 1) / group_size;
    clsparseIdx_t globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
clsparseStatus
csrmv_vector(const clsparse::array_base<T>& pAlpha,
             const clsparseCsrMatrixPrivate* pMatx,
             const clsparse::array_base<T>& pX,
             const clsparse::array_base<T>& pBeta,
             clsparse::array_base<T>& pY,
             clsparseControl control)
{
    cl_uint nnz_per_row = pMatx->nnz_per_row(); // average nnz per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256;    // 256 gives the best performance!
    cl_uint subwave_size = wave_size;

    // Adjust subwave_size according to nnz_per_row:
    // each subwave will be assigned to one row of the CSR matrix.
    if (wave_size > 32)
    {
        // this applies only to devices with a wavefront larger than 32, like AMD (64)
        if (nnz_per_row < 64) { subwave_size = 32; }
    }
    if (nnz_per_row < 32) { subwave_size = 16; }
    if (nnz_per_row < 16) { subwave_size = 8; }
    if (nnz_per_row < 8)  { subwave_size = 4; }
    if (nnz_per_row < 4)  { subwave_size = 2; }

    const std::string params = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "csrmv_general", "csrmv_general",
                                         params);

    KernelWrap kWrapper(kernel);

    cl_ulong offset = 0;

    kWrapper << pMatx->num_rows
             << pAlpha.data() << offset
             << pMatx->rowOffsets << pMatx->colIndices << pMatx->values
             << pX.data() << offset
             << pBeta.data() << offset
             << pY.data() << offset;

    // Each subwave takes care of one row of the matrix;
    // 'predicted' is the total number of work-items needed for all subwaves.
    cl_uint predicted = subwave_size * pMatx->num_rows;

    // NVIDIA devices reject launches whose global work size is not a multiple
    // of the work-group size, so round up; the performance impact is unclear.
    cl_uint global_work_size =
            group_size * ((predicted + group_size - 1) / group_size);

    cl::NDRange local(group_size);
    //cl::NDRange global(predicted > local[0] ? predicted : local[0]);
    cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
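/*
 * Hedged sketch of the subwave-per-row scheme behind "csrmv_general": each group
 * of SUBWAVE_SIZE work-items cooperates on one matrix row, striding over that
 * row's nonzeros and reducing the partial products in local memory. Scalar
 * buffer offsets passed by the host are simplified away here, and the names are
 * assumptions; this is not the clSPARSE kernel source.
 */
__kernel void csrmv_general(const INDEX_TYPE num_rows,
                            __global const VALUE_TYPE* alpha,
                            __global const INDEX_TYPE* row_offsets,
                            __global const INDEX_TYPE* col_indices,
                            __global const VALUE_TYPE* values,
                            __global const VALUE_TYPE* x,
                            __global const VALUE_TYPE* beta,
                            __global VALUE_TYPE* y)
{
    __local VALUE_TYPE sdata[WG_SIZE];

    const unsigned int lid  = get_local_id(0);
    const unsigned int lane = lid & (SUBWAVE_SIZE - 1u);
    const SIZE_TYPE    row  = get_global_id(0) / SUBWAVE_SIZE; // one subwave per row

    VALUE_TYPE sum = (VALUE_TYPE) 0;
    if (row < num_rows)
    {
        // each lane strides over this row's nonzeros
        for (INDEX_TYPE j = row_offsets[row] + lane;
             j < row_offsets[row + 1]; j += SUBWAVE_SIZE)
            sum += values[j] * x[col_indices[j]];
    }

    sdata[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // reduce the SUBWAVE_SIZE partial sums belonging to this row
    for (unsigned int s = SUBWAVE_SIZE / 2; s > 0; s >>= 1)
    {
        if (lane < s)
            sdata[lid] += sdata[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (row < num_rows && lane == 0)
        y[row] = alpha[0] * sdata[lid] + beta[0] * y[row];
}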
clsparseStatus
csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const clsparse::array_base<T>& pX,
                const clsparse::array_base<T>& pBeta,
                clsparse::array_base<T>& pY,
                clsparseControl control )
{
    const cl_uint group_size = 256;

    std::string params = std::string( )
            + " -DROWBITS=" + std::to_string( ROW_BITS )
            + " -DWGBITS=" + std::to_string( WG_BITS )
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
            + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
            + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

    if( sizeof( clsparseIdx_t ) == 8 )
    {
        std::string options = std::string()
                + " -DINDEX_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append( options );
    }
    else
    {
        std::string options = std::string()
                + " -DINDEX_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append( options );
    }

    std::string options;
    if( typeid( T ) == typeid( cl_double ) )
    {
        options = std::string() + " -DDOUBLE";
        if( !control->dpfp_support )
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }
    else if( typeid( T ) == typeid( cl_ulong ) )
        options = std::string() + " -DLONG";
    else if( typeid( T ) == typeid( cl_long ) )
        options = std::string() + " -DLONG";

    if( control->extended_precision )
        options += " -DEXTENDED_PRECISION";
    params.append( options );

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    const matrix_meta* meta_ptr = static_cast< const matrix_meta* >( pCsrMatx->meta );

    kWrapper << pCsrMatx->values
             << pCsrMatx->col_indices << pCsrMatx->row_pointer
             << pX.data() << pY.data()
             << meta_ptr->rowBlocks
             << pAlpha.data() << pBeta.data();
    //<< h_alpha << h_beta;

    // NVIDIA devices reject launches whose global work size is not a multiple
    // of the work-group size, so launch at least one full work-group; the
    // performance impact of this is unclear.
    // The global work size is derived from half the row-block buffer because only
    // half of it describes actual work; the other half is used for the extended
    // precision reduction.
    clsparseIdx_t global_work_size = ( ( meta_ptr->rowBlockSize / 2 ) - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}