clsparseStatus
dot(clsparseScalarPrivate* pR,
    const cldenseVectorPrivate* pX,
    const cldenseVectorPrivate* pY,
    const clsparseControl control)
{
    // The result scalar must start from zero; the reduction accumulates into it.
    init_scalar(pR, (T)0, control);

    // With 256 partial blocks the final reduction fits into a single workgroup.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;
    /* For future optimisation: derive the block count from the device instead,
       e.g. control->max_compute_units * WG_PER_CU with WG_PER_CU = 64. */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    const cl_ulong xSize = pX->num_values - pX->offset();
    const cl_ulong ySize = pY->num_values - pY->offset();
    assert (xSize == ySize);

    const cl_ulong size = xSize;
    if (size == 0)
        return clsparseSuccess;   // empty vectors: result stays zero

    cl::Context context = control->getContext();

    // Stage 1: one partial dot product per reduction block.
    cldenseVectorPrivate partial;
    clsparseInitVector(&partial);
    partial.num_values = REDUCE_BLOCKS_NUMBER;

    clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

    cl_int status = inner_product<T>(&partial, pX, pY, size,
                                     REDUCE_BLOCKS_NUMBER,
                                     REDUCE_BLOCK_SIZE, control);
    if (status != clsparseSuccess)
        return clsparseInvalidKernelExecution;

    // Stage 2: reduce the per-block partials into the scalar result.
    status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE, control);
    if (status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
clsparseStatus
dot(clsparse::array_base<T>& pR,
    const clsparse::array_base<T>& pX,
    const clsparse::array_base<T>& pY,
    const clsparseControl control)
{
    // Not strictly required, but guarantees pR holds zero before accumulating.
    init_scalar(pR, (T)0, control);

    // 256 partial blocks allow the final reduction to run in one workgroup.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;
    /* For future optimisation: size the block count from the device,
       e.g. control->max_compute_units * WG_PER_CU with WG_PER_CU = 64. */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    const cl_ulong xSize = pX.size();
    const cl_ulong ySize = pY.size();
    assert (xSize == ySize);

    const cl_ulong size = xSize;
    if (size == 0)
        return clsparseSuccess;   // nothing to reduce

    cl::Context context = control->getContext();

    // Stage 1: per-block partial dot products.
    clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                CL_MEM_READ_WRITE, false);

    cl_int status = inner_product<T>(partial, pX, pY, size,
                                     REDUCE_BLOCKS_NUMBER,
                                     REDUCE_BLOCK_SIZE, control);
    if (status != clsparseSuccess)
        return clsparseInvalidKernelExecution;

    // Stage 2: atomically fold the partials into pR.
    status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE, control);
    if (status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
/* Device-side prefix scan of `input` into `output` (same length), inclusive
 * or exclusive depending on `exclusive`.  Three kernel stages:
 *   1. per_block_inclusive_scan  - scan within each workgroup (2 elems/thread)
 *   2. intra_block_inclusive_scan - scan the per-block sums in one workgroup
 *   3. per_block_addition        - add block offsets back into the output
 * Returns clsparseSuccess, or an error status if uninitialized / a kernel
 * launch fails / doubles are requested on a device without DPFP support.
 * NOTE(review): template header (VectorType, OP) is declared outside this view.
 */
clsparseStatus scan(VectorType& output, const VectorType& input, clsparseControl control, bool exclusive)
{
    typedef typename VectorType::size_type SizeType; //check for cl_ulong
    typedef typename VectorType::value_type T;

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    assert (input.size() == output.size());

    SizeType num_elements = input.size();
    //std::cout << "num_elements = " << num_elements << std::endl;

    // Workgroup sizes are multiples of the device wavefront size.
    SizeType KERNEL02WAVES = 4;
    SizeType KERNEL1WAVES = 4;
    SizeType WAVESIZE = control->wavefront_size;

    SizeType kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    SizeType kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    SizeType kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    // Round element count up to a multiple of 2*WG size (2 elements per
    // work item in kernel 0); WG sizes are powers of two so masking works.
    SizeType numElementsRUP = num_elements;
    SizeType modWgSize = (numElementsRUP & ((kernel0_WgSize*2)-1));
    if( modWgSize )
    {
        numElementsRUP &= ~modWgSize;
        numElementsRUP += (kernel0_WgSize*2);
    }

    //2 element per work item
    SizeType numWorkGroupsK0 = numElementsRUP / (kernel0_WgSize*2);

    // One per-block partial sum per kernel-0 workgroup, rounded up the same way.
    SizeType sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & ((kernel0_WgSize*2)-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += (kernel0_WgSize*2);
    }

    cl::Context ctx = control->getContext();

    // Scratch vectors for block sums (RAII-managed, freed on every return path).
    clsparse::vector<T> preSumArray(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> preSumArray1(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> postSumArray(control, sizeScanBuff, 0, CL_MEM_READ_WRITE, false);

    // Identity element of the scan operator OP (0 for the ops used here).
    T operator_identity = 0;
    //std::cout << "operator_identity = " << operator_identity << std::endl;

    //scan in blocks
    {
        //local mem size: two elements per work item
        std::size_t lds = kernel0_WgSize * 2 * sizeof(T);

        // JIT options: value type, WG size, and the scan operator macro.
        std::string params = std::string()
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(kernel0_WgSize)
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

        // Index width must match the host-side clsparseIdx_t.
        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan", "per_block_inclusive_scan", params);
        KernelWrap kWrapper(kernel);
        kWrapper << input.data()
                 << operator_identity
                 << (SizeType)input.size()
                 << cl::Local(lds)
                 << preSumArray.data()
                 << preSumArray1.data()
                 << (int) exclusive;

        // Half as many work items as (rounded-up) elements: 2 elems per item.
        cl::NDRange global(numElementsRUP/2);
        cl::NDRange local (kernel0_WgSize);

        cl_int status = kWrapper.run(control, global, local);
        CLSPARSE_V(status, "Error: per_block_inclusive_scan");
        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    // Stage 2: scan the per-block sums; single workgroup, each thread
    // handles workPerThread entries.
    {
        //local mem size
        std::size_t lds = kernel0_WgSize * sizeof(T);
        SizeType workPerThread = sizeScanBuff / kernel1_WgSize;

        std::string params = std::string()
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(kernel1_WgSize)
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan", "intra_block_inclusive_scan", params);
        KernelWrap kWrapper(kernel);
        kWrapper << postSumArray.data()
                 << preSumArray.data()
                 << operator_identity
                 << numWorkGroupsK0
                 << cl::Local(lds)
                 << workPerThread;

        cl::NDRange global ( kernel1_WgSize );
        cl::NDRange local ( kernel1_WgSize );

        cl_int status = kWrapper.run(control, global, local);
        CLSPARSE_V(status, "Error: intra_block_inclusive_scan");
        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    // Stage 3: add each block's scanned offset back into the per-element
    // results, producing the final (in/ex)clusive scan in `output`.
    {
        std::size_t lds = kernel0_WgSize * sizeof(T); //local mem size

        // NOTE(review): WG_SIZE is compiled with kernel1_WgSize but the
        // launch below uses kernel2_WgSize; both equal WAVESIZE*4 today,
        // so this is consistent only by coincidence of the constants.
        std::string params = std::string()
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(kernel1_WgSize)
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string() + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan", "per_block_addition", params);
        KernelWrap kWrapper(kernel);
        kWrapper << output.data()
                 << input.data()
                 << postSumArray.data()
                 << preSumArray1.data()
                 << cl::Local(lds)
                 << num_elements
                 << (int)exclusive
                 << operator_identity;

        cl::NDRange global ( numElementsRUP );
        cl::NDRange local ( kernel2_WgSize );

        cl_int status = kWrapper.run(control, global, local);
        CLSPARSE_V(status, "Error: per_block_addition");
        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
clsparseStatus
dense_to_coo(clsparseCooMatrix* coo,
             const clsparse::vector<V>& A,
             const clsparse::vector<I>& nnz_locations,
             const clsparse::vector<I>& coo_indexes,
             const clsparseControl control)
{
    typedef typename clsparse::vector<V>::size_type SizeType;

    // Preconditions: matrix shape and nnz count are already known, and the
    // caller has filled the nnz-location / coo-index scratch vectors.
    assert(coo->num_nonzeros > 0);
    assert(coo->num_cols > 0);
    assert(coo->num_rows > 0);

    assert(A.size() > 0);
    assert(nnz_locations.size() > 0);
    assert(coo_indexes.size() > 0);

    SizeType dense_size = A.size();

    cl_int cl_status;

    // Allocate the COO output arrays; ownership passes to the caller's struct.
    coo->values = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                  coo->num_nonzeros * sizeof(V), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo values buffer");

    coo->colIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo col indices buffer");

    coo->rowIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo row indices buffer");

    const SizeType workgroup_size = 256;

    // Round the launch size up to a whole number of workgroups, with at
    // least one workgroup for very small inputs.
    SizeType global_work_size =
        ((dense_size + workgroup_size - 1) / workgroup_size) * workgroup_size;
    if (dense_size < workgroup_size)
        global_work_size = workgroup_size;

    const std::string params = std::string()
        + " -DINDEX_TYPE=" + OclTypeTraits<I>::type
        + " -DSIZE_TYPE=" + OclTypeTraits<SizeType>::type
        + " -DVALUE_TYPE=" + OclTypeTraits<V>::type
        + " -DWG_SIZE=" + std::to_string(workgroup_size)
        + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program;

    cl::Kernel kernel = KernelCache::get(control->queue, "conversion_utils",
                                         "scatter_coo_locations", params);

    KernelWrap kWrapper(kernel);
    kWrapper << (SizeType) coo->num_rows
             << (SizeType) coo->num_cols
             << (SizeType) dense_size
             << A.data()
             << nnz_locations.data()
             << coo_indexes.data()
             << coo->rowIndices
             << coo->colIndices
             << coo->values;

    cl::NDRange local(workgroup_size);
    cl::NDRange global(global_work_size);

    cl_status = kWrapper.run(control, global, local);
    CLSPARSE_V(cl_status, "Error process scaninput");

    if (cl_status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
clsparseStatus
reduce_by_key(
    int keys_first,
    int keys_last,
    int values_first,
    cl_mem keys_input,
    cl_mem values_input,
    cl_mem keys_output,
    cl_mem values_output,
    int *count,
    clsparseControl control )
{
    /**********************************************************************
     * Segmented reduction: runs of equal adjacent keys in keys_input are
     * collapsed into keys_output, with the corresponding values_input
     * entries combined into values_output.  *count receives the number of
     * segments produced (value read back from the device).
     * [keys_first, keys_last] is an inclusive index range; values_first is
     * currently unused but kept for interface compatibility.
     *
     * Fixes vs. previous version: the malloc'ed read-back buffer leaked on
     * every call, and all cl_mem scratch buffers leaked on early error
     * returns; both are addressed below.
     *********************************************************************/

    /* Workgroup sizing; WAVESIZE / KERNEL02WAVES / KERNEL1WAVES are
       file-level constants. */
    const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES;

    // Kernels are currently compiled with no extra JIT options.
    const std::string params;

    cl::Context context = control->getContext();

    cl_uint numElements = keys_last - keys_first + 1;

    // Round the launch size up to a multiple of the kernel-0 workgroup size
    // (workgroup sizes are powers of two, so bit masking works).
    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }

    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    // One partial result per kernel-0 workgroup, rounded up the same way.
    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    // Scratch buffers; every exit path below goes through release_temporaries.
    cl_mem tempArrayVec = clCreateBuffer(context(), CL_MEM_READ_WRITE,
                                         (numElements)*sizeof(int), NULL, NULL );
    cl_mem keySumArray  = NULL;
    cl_mem preSumArray  = NULL;
    cl_mem postSumArray = NULL;

    // Single cleanup point so early error returns do not leak cl_mem objects.
    auto release_temporaries = [&]()
    {
        if (tempArrayVec) clReleaseMemObject(tempArrayVec);
        if (preSumArray)  clReleaseMemObject(preSumArray);
        if (postSumArray) clReleaseMemObject(postSumArray);
        if (keySumArray)  clReleaseMemObject(keySumArray);
    };

    /**********************************************************************
     * Kernel 0: flag positions where the key changes (segment boundaries).
     *********************************************************************/
    cl::Kernel kernel0 = KernelCache::get(control->queue, "reduce_by_key",
                                          "OffsetCalculation", params);
    KernelWrap kWrapper0(kernel0);
    kWrapper0 << keys_input << tempArrayVec << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);
    if (status != CL_SUCCESS)
    {
        release_temporaries();
        return clsparseInvalidKernelExecution;
    }

    // Inclusive scan of the boundary flags gives each element its segment index.
    scan( 0, numElements - 1, tempArrayVec, tempArrayVec, 0, 0, control );

    int pattern = 0;
    keySumArray  = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    preSumArray  = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    postSumArray = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL);

    /**********************************************************************
     * Kernel 1: per-workgroup scan of values by key.
     *********************************************************************/
    cl::Kernel kernel1 = KernelCache::get(control->queue, "reduce_by_key",
                                          "perBlockScanByKey", params);
    KernelWrap kWrapper1(kernel1);
    kWrapper1 << tempArrayVec << values_input << numElements
              << keySumArray << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);
    if (status != CL_SUCCESS)
    {
        release_temporaries();
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************
     * Kernel 2: scan the per-block sums in a single workgroup.
     *********************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue, "reduce_by_key",
                                          "intraBlockInclusiveScanByKey", params);
    KernelWrap kWrapper2(kernel2);
    kWrapper2 << keySumArray << preSumArray << postSumArray
              << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);
    if (status != CL_SUCCESS)
    {
        release_temporaries();
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************
     * Kernel 3: scatter the reduced key/value pairs to the outputs.
     *********************************************************************/
    cl::Kernel kernel3 = KernelCache::get(control->queue, "reduce_by_key",
                                          "keyValueMapping", params);
    KernelWrap kWrapper3(kernel3);
    kWrapper3 << keys_input << keys_output << values_input << values_output
              << tempArrayVec << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);
    if (status != CL_SUCCESS)
    {
        release_temporaries();
        return clsparseInvalidKernelExecution;
    }

    // Blocking read of the last segment index; a stack variable replaces the
    // previously leaked malloc'ed buffer.
    int h_result = 0;
    clEnqueueReadBuffer(control->queue(), tempArrayVec, CL_TRUE,
                        (numElements-1)*sizeof(int), sizeof(int),
                        &h_result, 0, NULL, NULL);
    *count = h_result;
    //printf("h_result = %d\n", *count );

    //release buffers
    release_temporaries();

    return clsparseSuccess;
} //end of reduce_by_key