clsparseStatus atomic_reduce(clsparse::array_base<T>& pR, const clsparse::array_base<T>& pX, const cl_ulong wg_size, const clsparseControl control) { assert(wg_size == pX.size()); std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(wg_size) + " -D" + ReduceOperatorTrait<OP>::operation; if (typeid(cl_float) == typeid(T)) { std::string options = std::string() + " -DATOMIC_FLOAT"; params.append(options); } else if (typeid(cl_double) == typeid(T)) { std::string options = std::string() + " -DATOMIC_DOUBLE"; params.append(options); } else if (typeid(cl_int) == typeid(T)) { std::string options = std::string() + " -DATOMIC_INT"; params.append(options); } else { return clsparseInvalidType; } cl::Kernel kernel = KernelCache::get(control->queue, "atomic_reduce", "reduce_block", params); KernelWrap kWrapper(kernel); kWrapper << pR.data(); kWrapper << pX.data(); int blocksNum = (pX.size() + wg_size - 1) / wg_size; int globalSize = blocksNum * wg_size; cl::NDRange local(wg_size); cl::NDRange global(globalSize); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus csrmv_adaptive( const clsparseScalarPrivate* pAlpha, const clsparseCsrMatrixPrivate* pCsrMatx, const cldenseVectorPrivate* pX, const clsparseScalarPrivate* pBeta, cldenseVectorPrivate* pY, clsparseControl control ) { const cl_uint group_size = 256; std::string params = std::string( ) + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ); #ifdef DOUBLE buildFlags += " -DDOUBLE"; #endif if(typeid(T) == typeid(cl_double)) { std::string options = std::string() + " -DDOUBLE"; params.append(options); } cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", "csrmv_adaptive", params ); KernelWrap kWrapper( kernel ); kWrapper << pCsrMatx->values << pCsrMatx->colIndices << pCsrMatx->rowOffsets << pX->values << pY->values << pCsrMatx->rowBlocks << pAlpha->value << pBeta->value; //<< h_alpha << h_beta; // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] ); cl_int status = kWrapper.run( control, global, local ); if( status != CL_SUCCESS ) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus axpby(clsparse::array_base<T>& pY, const clsparse::array_base<T>& pAlpha, const clsparse::array_base<T>& pX, const clsparse::array_base<T>& pBeta, const clsparse::array_base<T>& pZ, const clsparseControl control) { const int group_size = 256; // this or higher? control->max_wg_size? const std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string( group_size ) + " -D" + ElementWiseOperatorTrait<OP>::operation; cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpby", params); KernelWrap kWrapper(kernel); cl_ulong size = pY.size(); //clsparse do not support offset; cl_ulong offset = 0; kWrapper << size << pY.data() << offset << pAlpha.data() << offset << pX.data() << offset << pBeta.data() << offset << pZ.data() << offset; int blocksNum = (size + group_size - 1) / group_size; int globalSize = blocksNum * group_size; cl::NDRange local(group_size); cl::NDRange global (globalSize); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus axpby(cl_ulong size, cldenseVectorPrivate* pY, const clsparseScalarPrivate* pAlpha, const cldenseVectorPrivate* pX, const clsparseScalarPrivate* pBeta, const clsparseControl control) { const int group_size = 256; // this or higher? control->max_wg_size? const std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string( group_size ) + " -D" + ElementWiseOperatorTrait<OP>::operation; cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpby", params); KernelWrap kWrapper(kernel); kWrapper << size << pY->values << pY->offset() << pAlpha->value << pAlpha->offset() << pX->values << pX->offset() << pBeta->value << pBeta->offset() << pY->values << pY->offset(); int blocksNum = (size + group_size - 1) / group_size; int globalSize = blocksNum * group_size; cl::NDRange local(group_size); cl::NDRange global (globalSize); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus inner_product (cldenseVectorPrivate* partial, const cldenseVectorPrivate* pX, const cldenseVectorPrivate* pY, const cl_ulong size, const cl_ulong REDUCE_BLOCKS_NUMBER, const cl_ulong REDUCE_BLOCK_SIZE, const clsparseControl control) { cl_ulong nthreads = REDUCE_BLOCK_SIZE * REDUCE_BLOCKS_NUMBER; std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE) + " -DREDUCE_BLOCK_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE) + " -DN_THREADS=" + std::to_string(nthreads); cl::Kernel kernel = KernelCache::get(control->queue, "dot", "inner_product", params); KernelWrap kWrapper(kernel); kWrapper << size << partial->values << pX->values << pY->values; cl::NDRange local(REDUCE_BLOCK_SIZE); cl::NDRange global(REDUCE_BLOCKS_NUMBER * REDUCE_BLOCK_SIZE); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus scale(clsparse::array_base<T>& pVector, const clsparse::array_base<T>& pAlpha, clsparseControl control) { const int group_size = 256; //const int group_size = control->max_wg_size; const std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DVALUE_TYPE="+ OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(group_size); cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params); KernelWrap kWrapper(kernel); cl_ulong size = pVector.size(); cl_ulong offset = 0; kWrapper << size << pVector.data() << offset << pAlpha.data() << offset; int blocksNum = (size + group_size - 1) / group_size; int globalSize = blocksNum * group_size; cl::NDRange local(group_size); cl::NDRange global (globalSize); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus csr2coo_transform(const int m, const int n, cl_mem csr_row_offsets, cl_mem coo_row_indices, const std::string& params, const cl_uint group_size, const cl_uint subwave_size, clsparseControl control) { cl::Kernel kernel = KernelCache::get(control->queue,"csr2coo", "csr2coo", params); KernelWrap kWrapper(kernel); kWrapper << m << n << csr_row_offsets << coo_row_indices; // subwave takes care of each row in matrix; // predicted number of subwaves to be executed; cl_uint predicted = subwave_size * m; //cl::NDRange local(group_size); //cl::NDRange global(predicted > local[0] ? predicted : local[0]); cl_uint global_work_size = group_size* ((predicted + group_size - 1 ) / group_size); cl::NDRange local(group_size); cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus scale( clsparse::array_base<T>& pResult, const clsparse::array_base<T>& pAlpha, const clsparse::array_base<T>& pVector, clsparseControl control) { const int group_size = 256; //const int group_size = control->max_wg_size; std::string params = std::string() + " -DVALUE_TYPE="+ OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(group_size); if (sizeof(clsparseIdx_t) == 8) { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type; params.append(options); } else { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type; params.append(options); } if(typeid(T) == typeid(cl_double)) { params.append(" -DDOUBLE"); if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "scale", params); KernelWrap kWrapper(kernel); clsparseIdx_t size = pResult.size(); clsparseIdx_t offset = 0; kWrapper << size << pResult.data() << offset << pVector.data() << offset << pAlpha.data() << offset; clsparseIdx_t blocksNum = (size + group_size - 1) / group_size; clsparseIdx_t globalSize = blocksNum * group_size; cl::NDRange local(group_size); cl::NDRange global (globalSize); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus csrmm( const clsparseScalarPrivate& pAlpha, const clsparseCsrMatrixPrivate& pSparseCsrA, const cldenseMatrixPrivate& pDenseB, const clsparseScalarPrivate& pBeta, cldenseMatrixPrivate& pDenseC, const clsparseControl control ) { cl_uint nnz_per_row = pSparseCsrA.nnz_per_row( ); //average nnz per row cl_uint wave_size = control->wavefront_size; cl_uint group_size = 256; // 256 gives best performance! cl_uint subwave_size = wave_size; // adjust subwave_size according to nnz_per_row; // each wavefron will be assigned to the row of the csr matrix if( wave_size > 32 ) { //this apply only for devices with wavefront > 32 like AMD(64) if( nnz_per_row < 64 ) { subwave_size = 32; } } if( nnz_per_row < 32 ) { subwave_size = 16; } if( nnz_per_row < 16 ) { subwave_size = 8; } if( nnz_per_row < 8 ) { subwave_size = 4; } if( nnz_per_row < 4 ) { subwave_size = 2; } std::string params = std::string( ) + "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DWG_SIZE=" + std::to_string( group_size ) + " -DWAVE_SIZE=" + std::to_string( wave_size ) + " -DSUBWAVE_SIZE=" + std::to_string( subwave_size ); if( typeid( T ) == typeid( cl_double ) ) { params += " -DDOUBLE"; } cl::Kernel kernel = KernelCache::get( control->queue, "csrmm_general", "csrmv_batched", params ); KernelWrap kWrapper( kernel ); kWrapper << pSparseCsrA.num_rows << pAlpha.value << pAlpha.offset( ) << pSparseCsrA.rowOffsets << pSparseCsrA.colIndices << pSparseCsrA.values << pDenseB.values << pDenseB.lead_dim << pDenseB.offset( ) << pBeta.value << pBeta.offset( ) << pDenseC.values << pDenseC.num_rows << pDenseC.num_cols << pDenseC.lead_dim << pDenseC.offset( ); // subwave takes care of each row in matrix; // predicted number of subwaves to be executed; cl_uint predicted = subwave_size * pSparseCsrA.num_rows; // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance cl_uint global_work_size = group_size* ( ( predicted + group_size - 1 ) / group_size ); cl::NDRange local( group_size ); //cl::NDRange global(predicted > local[0] ? predicted : local[0]); cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] ); cl_int status = kWrapper.run( control, global, local ); if( status != CL_SUCCESS ) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus dense_to_coo(clsparseCooMatrix* coo, const clsparse::vector<V>& A, const clsparse::vector<I>& nnz_locations, const clsparse::vector<I>& coo_indexes, const clsparseControl control) { typedef typename clsparse::vector<V>::size_type SizeType; assert(coo->num_nonzeros > 0); assert(coo->num_cols > 0); assert(coo->num_rows > 0); assert(A.size() > 0); assert(nnz_locations.size() > 0); assert(coo_indexes.size() > 0); SizeType dense_size = A.size(); cl_int cl_status; coo->values = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE, coo->num_nonzeros * sizeof(V), NULL, &cl_status ); CLSPARSE_V(cl_status, "Create coo values buffer"); coo->colIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE, coo->num_nonzeros * sizeof(I), NULL, &cl_status ); CLSPARSE_V(cl_status, "Create coo col indices buffer"); coo->rowIndices = clCreateBuffer(control->getContext()(), CL_MEM_READ_WRITE, coo->num_nonzeros * sizeof(I), NULL, &cl_status ); CLSPARSE_V(cl_status, "Create coo row indices buffer"); SizeType workgroup_size = 256; SizeType global_work_size = 0; if (dense_size % workgroup_size == 0) global_work_size = dense_size; else global_work_size = dense_size / workgroup_size * workgroup_size + workgroup_size; if (dense_size < workgroup_size) global_work_size = workgroup_size; const std::string params = std::string() + " -DINDEX_TYPE=" + OclTypeTraits<I>::type + " -DSIZE_TYPE=" + OclTypeTraits<SizeType>::type + " -DVALUE_TYPE=" + OclTypeTraits<V>::type + " -DWG_SIZE=" + std::to_string(workgroup_size) + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program; //cl::Kernel kernel = KernelCache::get(control->queue,"dense2csr", "spread_value", params); cl::Kernel kernel = KernelCache::get(control->queue,"conversion_utils", "scatter_coo_locations", params); KernelWrap kWrapper(kernel); kWrapper << (SizeType) coo->num_rows << (SizeType) coo->num_cols << (SizeType) dense_size << A.data() << nnz_locations.data() << coo_indexes.data() << coo->rowIndices << coo->colIndices << coo->values; cl::NDRange local(workgroup_size); cl::NDRange global(global_work_size); cl_status = kWrapper.run(control, global, local); CLSPARSE_V(cl_status, "Error process scaninput"); if (cl_status != CL_SUCCESS) return clsparseInvalidKernelExecution; return clsparseSuccess; }
clsparseStatus calculate_num_nonzeros(/*dense matrix*/ const clsparse::vector<V>& A, clsparse::vector<I>& nnz_locations, I& num_nonzeros, const clsparseControl control) { typedef typename clsparse::vector<I>::size_type SizeType; SizeType dense_size = A.size(); SizeType workgroup_size = 256; SizeType global_work_size = 0; if (dense_size % workgroup_size == 0) global_work_size = dense_size; else global_work_size = dense_size / workgroup_size * workgroup_size + workgroup_size; if (dense_size < workgroup_size) global_work_size = workgroup_size; const std::string params = std::string() + " -DINDEX_TYPE=" + OclTypeTraits<I>::type + " -DSIZE_TYPE=" + OclTypeTraits<SizeType>::type + " -DVALUE_TYPE=" + OclTypeTraits<V>::type + " -DWG_SIZE=" + std::to_string(workgroup_size) + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program; //cl::Kernel kernel = KernelCache::get(control->queue,"dense2csr", "process_scaninput", params); cl::Kernel kernel = KernelCache::get(control->queue,"conversion_utils", "scan_nonzero_locations", params); KernelWrap kWrapper(kernel); kWrapper << dense_size << A.data() << nnz_locations.data(); cl::NDRange local(workgroup_size); cl::NDRange global(global_work_size); cl_int cl_status = kWrapper.run(control, global, local); CLSPARSE_V(cl_status, "Error process scaninput"); if (cl_status != CL_SUCCESS) return clsparseInvalidKernelExecution; //TODO: is it just write_only? clsparse::vector<I> nnz (control, 1, 0, CL_MEM_READ_WRITE, false); //due to this definition nnz and nnz_location have to be of the same type; clsparseStatus status = reduce<I, RO_PLUS>(nnz, nnz_locations, control); CLSPARSE_V(status, "Error: reduce"); if (status!= clsparseSuccess) return clsparseInvalidKernelExecution; num_nonzeros = nnz[0]; //std::cout << "NNZ: " << num_nonzeros << std::endl; return status; }
clsparseStatus csrmv_adaptive( const clsparseScalarPrivate* pAlpha, const clsparseCsrMatrixPrivate* pCsrMatx, const cldenseVectorPrivate* pX, const clsparseScalarPrivate* pBeta, cldenseVectorPrivate* pY, clsparseControl control ) { const cl_uint group_size = 256; std::string params = std::string( ) + " -DROWBITS=" + std::to_string( ROW_BITS ) + " -DWGBITS=" + std::to_string( WG_BITS ) + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string( group_size ) + " -DBLOCKSIZE=" + std::to_string( BLKSIZE ) + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER ) + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR ); if( sizeof( clsparseIdx_t ) == 8 ) { std::string options = std::string() + " -DINDEX_TYPE=" + OclTypeTraits<cl_ulong>::type; params.append(options); } else { std::string options = std::string() + " -DINDEX_TYPE=" + OclTypeTraits<cl_uint>::type; params.append(options); } std::string options; if(typeid(T) == typeid(cl_double)) { options = std::string() + " -DDOUBLE"; if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } else if(typeid(T) == typeid(cl_ulong)) options = std::string() + " -DLONG"; else if(typeid(T) == typeid(cl_long)) options = std::string() + " -DLONG"; if(control->extended_precision) options += " -DEXTENDED_PRECISION"; params.append(options); cl::Kernel kernel = KernelCache::get( control->queue, "csrmv_adaptive", "csrmv_adaptive", params ); KernelWrap kWrapper( kernel ); const matrix_meta* meta_ptr = static_cast< const matrix_meta* >( pCsrMatx->meta ); kWrapper << pCsrMatx->values << pCsrMatx->col_indices << pCsrMatx->row_pointer << pX->values << pY->values << meta_ptr->rowBlocks << pAlpha->value << pBeta->value; //<< h_alpha << h_beta; // if NVIDIA is used it does not allow to run the group size // which is not a multiplication of group_size. Don't know if that // have an impact on performance // Setting global work size to half the row block size because we are only // using half the row blocks buffer for actual work. // The other half is used for the extended precision reduction. clsparseIdx_t global_work_size = ( ( meta_ptr->rowBlockSize/2) - 1 ) * group_size; cl::NDRange local( group_size ); cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] ); cl_int status = kWrapper.run( control, global, local ); if( status != CL_SUCCESS ) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }
clsparseStatus scan(VectorType& output, const VectorType& input, clsparseControl control, bool exclusive) { typedef typename VectorType::size_type SizeType; //check for cl_ulong typedef typename VectorType::value_type T; if (!clsparseInitialized) { return clsparseNotInitialized; } //check opencl elements if (control == nullptr) { return clsparseInvalidControlObject; } assert (input.size() == output.size()); SizeType num_elements = input.size(); //std::cout << "num_elements = " << num_elements << std::endl; SizeType KERNEL02WAVES = 4; SizeType KERNEL1WAVES = 4; SizeType WAVESIZE = control->wavefront_size; SizeType kernel0_WgSize = WAVESIZE*KERNEL02WAVES; SizeType kernel1_WgSize = WAVESIZE*KERNEL1WAVES; SizeType kernel2_WgSize = WAVESIZE*KERNEL02WAVES; SizeType numElementsRUP = num_elements; SizeType modWgSize = (numElementsRUP & ((kernel0_WgSize*2)-1)); if( modWgSize ) { numElementsRUP &= ~modWgSize; numElementsRUP += (kernel0_WgSize*2); } //2 element per work item SizeType numWorkGroupsK0 = numElementsRUP / (kernel0_WgSize*2); SizeType sizeScanBuff = numWorkGroupsK0; modWgSize = (sizeScanBuff & ((kernel0_WgSize*2)-1)); if( modWgSize ) { sizeScanBuff &= ~modWgSize; sizeScanBuff += (kernel0_WgSize*2); } cl::Context ctx = control->getContext(); clsparse::vector<T> preSumArray(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false); clsparse::vector<T> preSumArray1(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false); clsparse::vector<T> postSumArray(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false); T operator_identity = 0; //std::cout << "operator_identity = " << operator_identity << std::endl; //scan in blocks { //local mem size std::size_t lds = kernel0_WgSize * 2 * sizeof(T); std::string params = std::string() + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(kernel0_WgSize) + " -D" + ElementWiseOperatorTrait<OP>::operation; if (sizeof(clsparseIdx_t) == 8) { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type; params.append(options); } else { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type; params.append(options); } if(typeid(T) == typeid(cl_double)) { params.append(" -DDOUBLE"); if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } cl::Kernel kernel = KernelCache::get(control->queue, "scan", "per_block_inclusive_scan", params); KernelWrap kWrapper(kernel); kWrapper << input.data() << operator_identity << (SizeType)input.size() << cl::Local(lds) << preSumArray.data() << preSumArray1.data() << (int) exclusive; cl::NDRange global(numElementsRUP/2); cl::NDRange local (kernel0_WgSize); cl_int status = kWrapper.run(control, global, local); CLSPARSE_V(status, "Error: per_block_inclusive_scan"); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } } { //local mem size std::size_t lds = kernel0_WgSize * sizeof(T); SizeType workPerThread = sizeScanBuff / kernel1_WgSize; std::string params = std::string() + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(kernel1_WgSize) + " -D" + ElementWiseOperatorTrait<OP>::operation; if (sizeof(clsparseIdx_t) == 8) { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type; params.append(options); } else { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type; params.append(options); } if(typeid(T) == typeid(cl_double)) { params.append(" -DDOUBLE"); if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } cl::Kernel kernel = KernelCache::get(control->queue, "scan", "intra_block_inclusive_scan", params); KernelWrap kWrapper(kernel); kWrapper << postSumArray.data() << preSumArray.data() << operator_identity << numWorkGroupsK0 << cl::Local(lds) << workPerThread; cl::NDRange global ( kernel1_WgSize ); cl::NDRange local ( kernel1_WgSize ); cl_int status = kWrapper.run(control, global, local); CLSPARSE_V(status, "Error: intra_block_inclusive_scan"); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } } { std::size_t lds = kernel0_WgSize * sizeof(T); //local mem size std::string params = std::string() + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(kernel1_WgSize) + " -D" + ElementWiseOperatorTrait<OP>::operation; if (sizeof(clsparseIdx_t) == 8) { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type; params.append(options); } else { std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type; params.append(options); } if(typeid(T) == typeid(cl_double)) { params.append(" -DDOUBLE"); if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } cl::Kernel kernel = KernelCache::get(control->queue, "scan", "per_block_addition", params); KernelWrap kWrapper(kernel); kWrapper << output.data() << input.data() << postSumArray.data() << preSumArray1.data() << cl::Local(lds) << num_elements << (int)exclusive << operator_identity; cl::NDRange global ( numElementsRUP ); cl::NDRange local ( kernel2_WgSize ); cl_int status = kWrapper.run(control, global, local); CLSPARSE_V(status, "Error: per_block_addition"); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } } return clsparseSuccess; }
clsparseStatus extract_diagonal(cldenseVectorPrivate* pDiag, const clsparseCsrMatrixPrivate* pA, clsparseControl control) { if (!clsparseInitialized) { return clsparseNotInitialized; } //check opencl elements if (control == nullptr) { return clsparseInvalidControlObject; } assert (pA->num_rows > 0); assert (pA->num_cols > 0); assert (pA->num_nonzeros > 0); assert (pDiag->num_values == std::min(pA->num_rows, pA->num_cols)); cl_ulong wg_size = 256; cl_ulong size = pA->num_rows; cl_ulong nnz_per_row = pA->nnz_per_row(); cl_ulong wave_size = control->wavefront_size; cl_ulong subwave_size = wave_size; // adjust subwave_size according to nnz_per_row; // each wavefron will be assigned to the row of the csr matrix if(wave_size > 32) { //this apply only for devices with wavefront > 32 like AMD(64) if (nnz_per_row < 64) { subwave_size = 32; } } if (nnz_per_row < 32) { subwave_size = 16; } if (nnz_per_row < 16) { subwave_size = 8; } if (nnz_per_row < 8) { subwave_size = 4; } if (nnz_per_row < 4) { subwave_size = 2; } std::string params = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type + " -DINDEX_TYPE=" + OclTypeTraits<cl_int>::type + " -DVALUE_TYPE=" + OclTypeTraits<T>::type + " -DWG_SIZE=" + std::to_string(wg_size) + " -DWAVE_SIZE=" + std::to_string(wave_size) + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size); if (inverse) params.append(" -DOP_DIAG_INVERSE"); if(typeid(T) == typeid(cl_double)) { params.append(" -DDOUBLE"); if (!control->dpfp_support) { #ifndef NDEBUG std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl; #endif return clsparseInvalidDevice; } } cl::Kernel kernel = KernelCache::get(control->queue, "matrix_utils", "extract_diagonal", params); KernelWrap kWrapper(kernel); kWrapper << size << pDiag->values << pA->rowOffsets << pA->colIndices << pA->values; cl_uint predicted = subwave_size * size; cl_uint global_work_size = wg_size * ((predicted + wg_size - 1 ) / wg_size); cl::NDRange local(wg_size); //cl::NDRange global(predicted > local[0] ? predicted : local[0]); cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]); cl_int status = kWrapper.run(control, global, local); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } return clsparseSuccess; }