clsparseStatus
clsparseDCooMatrixfromFile( clsparseCooMatrix* cooMatx, const char* filePath,
                            clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCooMatrixPrivate* pCooMatx = static_cast<clsparseCooMatrixPrivate*>( cooMatx );

    // Check that the file format is matrix market; the only format we can read right now
    // This is not a complete solution, and fails for directories with file names etc...
    // TODO: Should we use boost filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    pCooMatx->num_rows = mm_reader.GetNumRows( );
    pCooMatx->num_cols = mm_reader.GetNumCols( );
    pCooMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfer data from the CPU buffers to the GPU buffers
    clMemRAII< cl_double > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< clsparseIdx_t > rCoocol_indices( control->queue( ), pCooMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCoorow_indices( control->queue( ), pCooMatx->row_indices );

    cl_double* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                 pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoocol_indices = rCoocol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                               pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoorow_indices = rCoorow_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                               pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );

    // JPA: The COO matrix also needs to be sorted, because we require a matrix
    // sorted by row and then by column; mtx files are usually in the opposite order.
    std::sort( coords, coords + pCooMatx->num_nonzeros, CoordinateCompare< cl_double > );

    for( clsparseIdx_t c = 0; c < pCooMatx->num_nonzeros; ++c )
    {
        iCoorow_indices[ c ] = coords[ c ].x;
        iCoocol_indices[ c ] = coords[ c ].y;
        fCooValues[ c ] = coords[ c ].val;
    }

    return clsparseSuccess;
}
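// Illustrative sketch (not the library's definition): the row-major ordering
// that CoordinateCompare is assumed to impose before the COO arrays are filled
// above. The Coordinate fields (x = row, y = column, val) follow the usage in
// clsparseDCooMatrixfromFile.
template< typename FloatType >
bool CoordinateCompareSketch( const Coordinate< FloatType >& c1,
                              const Coordinate< FloatType >& c2 )
{
    if( c1.x != c2.x )
        return ( c1.x < c2.x );  // primary key: row index
    return ( c1.y < c2.y );      // secondary key: column index
}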
template <typename T>
clsparseStatus dot(clsparseScalarPrivate* pR,
                   const cldenseVectorPrivate* pX,
                   const cldenseVectorPrivate* pY,
                   const clsparseControl control)
{
    clsparseStatus status;

    init_scalar(pR, (T)0, control);

    // With REDUCE_BLOCKS_NUMBER = 256, the final reduction can be performed
    // within one block.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute unit;
    const cl_uint WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX->num_values - pX->offset();
    cl_ulong ySize = pY->num_values - pY->offset();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        // partial result
        cldenseVectorPrivate partial;
        clsparseInitVector(&partial);
        partial.num_values = REDUCE_BLOCKS_NUMBER;

        clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

        status = inner_product<T>(&partial, pX, pY, size,
                                  REDUCE_BLOCKS_NUMBER, REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
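// Hypothetical host-side sketch (not the device kernels) of the two-stage
// reduction that inner_product and atomic_reduce are assumed to implement:
// stage 1 computes num_blocks partial dot products, stage 2 folds the partials
// into a single scalar. dot_two_stage_sketch is an illustrative name only.
#include <cstddef>
#include <vector>
template <typename T>
T dot_two_stage_sketch(const T* x, const T* y, std::size_t n,
                       std::size_t num_blocks)
{
    // Stage 1: each "block" b accumulates a strided slice of x[i]*y[i].
    std::vector<T> partial(num_blocks, T(0));
    for (std::size_t i = 0; i < n; ++i)
        partial[i % num_blocks] += x[i] * y[i];

    // Stage 2: reduce the partials (done with atomics on the device).
    T result = T(0);
    for (std::size_t b = 0; b < num_blocks; ++b)
        result += partial[b];
    return result;
}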
clsparseStatus
clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check to ensure num_rows can fit in (64 - ROW_BITS) bits
    if( static_cast<cl_ulong>( pCsrMatx->num_rows ) > static_cast<cl_ulong>( pow( 2, ( 64 - ROW_BITS ) ) ) )
    {
        printf( "Number of rows in the sparse matrix is greater than what is supported at present ((64-ROW_BITS) bits)!\n" );
        return clsparseOutOfResources;
    }

    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );
    cl_int* rowDelimiters = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_READ,
                                                     pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks );
    cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                    pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize );

    ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE );

    return clsparseSuccess;
}
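// Simplified, hypothetical sketch of what a row-blocks computation does for
// CSR-Adaptive style SpMV: group consecutive rows so that each block covers at
// most blkSize nonzeros, recording the starting row of every block. The real
// ComputeRowBlocks additionally splits very long rows and packs scheduling
// flags into the low bits of each entry; both are omitted here.
void ComputeRowBlocksSketch( cl_ulong* rowBlocks, size_t rowBlockSize,
                             const cl_int* rowDelimiters, cl_int nRows, int blkSize )
{
    size_t blk = 0;
    rowBlocks[ blk++ ] = 0;           // first block starts at row 0
    cl_int nnzInBlock = 0;
    for( cl_int row = 0; row < nRows; ++row )
    {
        nnzInBlock += rowDelimiters[ row + 1 ] - rowDelimiters[ row ];
        if( nnzInBlock > blkSize && blk < rowBlockSize )
        {
            rowBlocks[ blk++ ] = row; // close the current block before this row
            nnzInBlock = rowDelimiters[ row + 1 ] - rowDelimiters[ row ];
        }
    }
    if( blk < rowBlockSize )
        rowBlocks[ blk ] = nRows;     // sentinel: one past the last row
}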
// Converts a sparse matrix in COO compressed format into CSR compressed format
// Pre-condition: the CL device memory for the CSR values, colIndices and rowOffsets
// has to be allocated prior to entering this routine, and the offset variables
// for cl1.2 have to be set
clsparseStatus
clsparseScoo2csr_host( clsparseCsrMatrix* csrMatx, const clsparseCooMatrix* cooMatx,
                       clsparseControl control )
{
    if( !clsparseInitialized )
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if( control == nullptr )
    {
        return clsparseInvalidControlObject;
    }

    const clsparseCooMatrixPrivate* pCooMatx = static_cast<const clsparseCooMatrixPrivate*>( cooMatx );
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );
    pCsrMatx->num_rows = pCooMatx->num_rows;
    pCsrMatx->num_cols = pCooMatx->num_cols;
    pCsrMatx->num_nonzeros = pCooMatx->num_nonzeros;

    clMemRAII< cl_float > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< cl_int > rCooColIndices( control->queue( ), pCooMatx->colIndices );
    clMemRAII< cl_int > rCooRowIndices( control->queue( ), pCooMatx->rowIndices );
    clMemRAII< cl_float > rCsrValues( control->queue( ), pCsrMatx->values );
    clMemRAII< cl_int > rCsrColIndices( control->queue( ), pCsrMatx->colIndices );
    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );

    cl_float* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_READ,
                                                pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooColIndices = rCooColIndices.clMapMem( CL_TRUE, CL_MAP_READ,
                                                      pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooRowIndices = rCooRowIndices.clMapMem( CL_TRUE, CL_MAP_READ,
                                                      pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    cl_float* fCsrValues = rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrColIndices = rCsrColIndices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                      pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrRowOffsets = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                      pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    coo2csr_transform( fCooValues, iCooColIndices, iCooRowIndices,
                       pCooMatx->num_nonzeros,
                       fCsrValues, iCsrColIndices, iCsrRowOffsets );

    return clsparseSuccess;
}
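// Hypothetical sketch of what a host-side coo2csr_transform amounts to when the
// COO triples are already sorted by (row, column): values and column indices
// copy through unchanged, and the row indices are compressed into offsets by
// counting entries per row and taking a prefix sum. The nRows parameter is an
// addition for this sketch; the real signature above does not take it.
void coo2csr_transform_sketch( const cl_float* cooVals, const cl_int* cooCols,
                               const cl_int* cooRows, cl_int nnz, cl_int nRows,
                               cl_float* csrVals, cl_int* csrCols, cl_int* csrRowOffsets )
{
    for( cl_int i = 0; i <= nRows; ++i )
        csrRowOffsets[ i ] = 0;

    for( cl_int i = 0; i < nnz; ++i )
    {
        csrVals[ i ] = cooVals[ i ];          // values pass through
        csrCols[ i ] = cooCols[ i ];          // column indices pass through
        csrRowOffsets[ cooRows[ i ] + 1 ]++;  // histogram of nnz per row
    }

    for( cl_int r = 0; r < nRows; ++r )       // prefix sum -> row offsets
        csrRowOffsets[ r + 1 ] += csrRowOffsets[ r ];
}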
clsparseStatus
clsparseScsr2coo(const clsparseCsrMatrix* csr, clsparseCooMatrix* coo,
                 const clsparseControl control)
{
    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    const clsparseCsrMatrixPrivate* pCsr = static_cast<const clsparseCsrMatrixPrivate*>(csr);
    clsparseCooMatrixPrivate* pCoo = static_cast<clsparseCooMatrixPrivate*>(coo);

    pCoo->num_rows = pCsr->num_rows;
    pCoo->num_cols = pCsr->num_cols;
    pCoo->num_nonzeros = pCsr->num_nonzeros;

    clsparseStatus status;

    //validate cl_mem objects
    status = validateMemObject(pCoo->rowIndices, sizeof(cl_int) * pCoo->num_nonzeros);
    if (status != clsparseSuccess)
        return status;
    status = validateMemObject(pCoo->colIndices, sizeof(cl_int) * pCoo->num_nonzeros);
    if (status != clsparseSuccess)
        return status;
    status = validateMemObject(pCoo->values, sizeof(cl_float) * pCoo->num_nonzeros);
    if (status != clsparseSuccess)
        return status;

    //validate cl_mem sizes
    //TODO: ask about validateMemObjectSize

    cl_uint nnz_per_row = pCoo->num_nonzeros / pCoo->num_rows; //average nonzeros per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256;    // wave_size * 8; 256 gives the best performance!
    cl_uint subwave_size = wave_size;

    // Adjust subwave_size according to nnz_per_row;
    // each subwavefront will be assigned to a row of the CSR matrix.
    if (wave_size > 32)
    {
        // This applies only to devices with a wavefront larger than 32, like AMD's 64.
        if (nnz_per_row < 64) { subwave_size = 32; }
    }
    if (nnz_per_row < 32) { subwave_size = 16; }
    if (nnz_per_row < 16) { subwave_size = 8;  }
    if (nnz_per_row < 8)  { subwave_size = 4;  }
    if (nnz_per_row < 4)  { subwave_size = 2;  }

    const std::string params = std::string()
            + "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<cl_float>::type
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

    //TODO: add error handling
    //copy indices
    clEnqueueCopyBuffer(control->queue(), pCsr->colIndices, pCoo->colIndices,
                        0, 0, sizeof(cl_int) * pCoo->num_nonzeros, 0, NULL, NULL);
    //copy values
    clEnqueueCopyBuffer(control->queue(), pCsr->values, pCoo->values,
                        0, 0, sizeof(cl_float) * pCoo->num_nonzeros, 0, NULL, NULL);

    return csr2coo_transform(pCoo->num_rows, pCoo->num_cols,
                             pCsr->rowOffsets, pCoo->rowIndices,
                             params, group_size, subwave_size, control);
}
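// Hypothetical host-side sketch of the expansion csr2coo_transform performs on
// the device: values and column indices are plain copies (done above with
// clEnqueueCopyBuffer), so only the row offsets need expanding into explicit
// row indices, one per nonzero.
void csr2coo_expand_sketch(cl_int num_rows, const cl_int* rowOffsets,
                           cl_int* rowIndices)
{
    for (cl_int r = 0; r < num_rows; ++r)
        for (cl_int i = rowOffsets[r]; i < rowOffsets[r + 1]; ++i)
            rowIndices[i] = r;  // every nonzero in row r gets row index r
}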
template <typename T>
clsparseStatus
indices_to_offsets(clsparse::vector<T>& offsets,
                   const clsparse::vector<T>& indices,
                   const clsparseControl control)
{
    typedef typename clsparse::vector<T> IndicesArray;
    typedef typename clsparse::vector<T>::size_type SizeType;

    //if (std::is_integral<T>)

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    SizeType size = indices.size() > offsets.size() ? indices.size() : offsets.size();

    IndicesArray values        (control, indices.size(), 1, CL_MEM_READ_WRITE, true);
    IndicesArray keys_output   (control, indices.size(), 0, CL_MEM_READ_WRITE, false);
    IndicesArray values_output (control, size,           0, CL_MEM_READ_WRITE, false);

    clsparseStatus status = internal::reduce_by_key(keys_output, values_output,
                                                    indices, values, control);
    CLSPARSE_V(status, "Error: reduce_by_key");
    if (status != clsparseSuccess)
        return status;

    assert(values_output.size() >= offsets.size());

    cl_event clEvent;
    cl_int cl_status = clEnqueueCopyBuffer(control->queue(),
                                           values_output.data()(), offsets.data()(),
                                           0, 0, offsets.size() * sizeof(T),
                                           0, nullptr, &clEvent);
    CLSPARSE_V(cl_status, "Error: Enqueue copy buffer values to offsets");

    cl_status = clWaitForEvents(1, &clEvent);
    CLSPARSE_V(cl_status, "Error: clWaitForEvents");

    cl_status = clReleaseEvent(clEvent);
    CLSPARSE_V(cl_status, "Error: clReleaseEvent");

    // Don't know why, but this throws a CL_INVALID_CONTEXT error:
    // cl::Event event;
    // cl::enqueueCopyBuffer(values_output.data(), offsets.data(),
    //                       0, 0, offsets.size(),
    //                       nullptr, &event);
    // CLSPARSE_V(cl_status, "Error: enqueueCopyBuffer");
    // CLSPARSE_V(event.wait(), "Error: event wait");

    status = exclusive_scan<EW_PLUS>(offsets, offsets, control);
    return status;
}
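// Hypothetical sequential sketch of the idea behind indices_to_offsets:
// reduce-by-key over the (sorted) row indices with all values = 1 yields a
// per-row nonzero count, and the exclusive plus-scan of those counts yields
// the CSR row offsets. Empty-row handling is simplified here.
#include <vector>
template <typename T>
std::vector<T> indices_to_offsets_sketch(const std::vector<T>& indices, T num_rows)
{
    std::vector<T> offsets(num_rows + 1, T(0));
    for (const T& row : indices)
        offsets[row + 1] += T(1);          // reduce_by_key: count per row
    for (T r = 0; r < num_rows; ++r)
        offsets[r + 1] += offsets[r];      // exclusive_scan<EW_PLUS>
    return offsets;
}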
clsparseStatus
clsparseDCsrMatrixfromFile( clsparseCsrMatrix* csrMatx, const char* filePath,
                            clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check that the file format is matrix market; the only format we can read right now
    // This is not a complete solution, and fails for directories with file names etc...
    // TODO: Should we use boost filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    // Read data from a file on disk into CPU buffers
    // Data is read natively as COO format with the reader
    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    // BUG: We need to check whether OpenCL buffers currently exist and deallocate them first!
    // FIX: The code below checks whether the buffers were allocated in the first place.
    {
        clsparseStatus validationStatus = validateMemObject( pCsrMatx->values,
                                                             mm_reader.GetNumNonZeroes() * sizeof(cl_double) );
        // I don't want to reallocate the buffer, because that would override the user's
        // buffer memory flags; it is the user's responsibility to provide a valid buffer.
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject( pCsrMatx->col_indices,
                                              mm_reader.GetNumNonZeroes() * sizeof(clsparseIdx_t) );
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject( pCsrMatx->row_pointer,
                                              ( mm_reader.GetNumRows() + 1 ) * sizeof(clsparseIdx_t) );
        if (validationStatus != clsparseSuccess)
            return validationStatus;
    }

    pCsrMatx->num_rows = mm_reader.GetNumRows( );
    pCsrMatx->num_cols = mm_reader.GetNumCols( );
    pCsrMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfer data from the CPU buffers to the GPU buffers
    cl_int mapStatus = 0;
    clMemRAII< cl_double > rCsrValues( control->queue( ), pCsrMatx->values );
    clMemRAII< clsparseIdx_t > rCsrcol_indices( control->queue( ), pCsrMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCsrrow_pointer( control->queue( ), pCsrMatx->row_pointer );

    cl_double* fCsrValues = rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                 pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrValues failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrcol_indices = rCsrcol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                               pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrcol_indices failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrrow_pointer = rCsrrow_pointer.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                                               pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrrow_pointer failed");
        return clsparseInvalidMemObj;
    }

    // The following section of code converts the sparse format from COO to CSR
    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );
    std::sort( coords, coords + pCsrMatx->num_nonzeros, CoordinateCompare< cl_double > );

    clsparseIdx_t current_row = 1;
    iCsrrow_pointer[ 0 ] = 0;
    for( clsparseIdx_t i = 0; i < pCsrMatx->num_nonzeros; i++ )
    {
        iCsrcol_indices[ i ] = coords[ i ].y;
        fCsrValues[ i ] = coords[ i ].val;
        while( coords[ i ].x >= current_row )
            iCsrrow_pointer[ current_row++ ] = i;
    }

    iCsrrow_pointer[ current_row ] = pCsrMatx->num_nonzeros;
    while( current_row <= pCsrMatx->num_rows )
        iCsrrow_pointer[ current_row++ ] = pCsrMatx->num_nonzeros;

    return clsparseSuccess;
}
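// Worked example of the row-pointer fill above (illustrative only). For a 3x3
// matrix with sorted COO row indices x = {0, 0, 2}, the conversion produces
//     row_pointer = { 0, 2, 2, 3 }
// i.e. row 0 holds entries [0,2), the empty row 1 holds [2,2), and row 2 holds
// [2,3); the trailing while-loop pads the pointers of any empty final rows.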
clsparseStatus
reduce_by_key( int keys_first, int keys_last, int values_first,
               cl_mem keys_input, cl_mem values_input,
               cl_mem keys_output, cl_mem values_output,
               int *count, clsparseControl control )
{
    cl_int l_Error;

    /**********************************************************************************
     * Compile Options
     *********************************************************************************/
    const int kernel0_WgSize = WAVESIZE * KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE * KERNEL1WAVES;
    const int kernel2_WgSize = WAVESIZE * KERNEL02WAVES;

    //const std::string params = std::string() +
    //          " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize)
    //        + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize)
    //        + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize);
    const std::string params;

    cl::Context context = control->getContext();
    std::vector<cl::Device> dev = context.getInfo<CL_CONTEXT_DEVICES>();
    int computeUnits = dev[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >( );
    // NOTE: despite its name, this queries the max work-group size
    int wgPerComputeUnit = dev[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >( );
    int resultCnt = computeUnits * wgPerComputeUnit;

    cl_uint numElements = keys_last - keys_first + 1;

    // Round the input size up to a multiple of the kernel 0 work-group size
    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize - 1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }
    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    // Round the scan buffer size up to a multiple of the work-group size as well
    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize - 1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    // TODO: the early error returns below leak these temporary cl_mem buffers
    cl_mem tempArrayVec = clCreateBuffer(context(), CL_MEM_READ_WRITE,
                                         (numElements) * sizeof(int), NULL, NULL );

    /**********************************************************************************
     * Kernel 0
     *********************************************************************************/
    cl::Kernel kernel0 = KernelCache::get(control->queue, "reduce_by_key",
                                          "OffsetCalculation", params);
    KernelWrap kWrapper0(kernel0);
    kWrapper0 << keys_input << tempArrayVec << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);
    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int init = 0;
    scan( 0, numElements - 1, tempArrayVec, tempArrayVec, 0, 0, control );

    int pattern = 0;
    cl_mem keySumArray  = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff) * sizeof(int), NULL, NULL );
    cl_mem preSumArray  = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff) * sizeof(int), NULL, NULL );
    cl_mem postSumArray = clCreateBuffer(context(), CL_MEM_READ_WRITE, (sizeScanBuff) * sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray,  &pattern, sizeof(int), 0, (sizeScanBuff) * sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray,  &pattern, sizeof(int), 0, (sizeScanBuff) * sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0, (sizeScanBuff) * sizeof(int), 0, NULL, NULL);

    /**********************************************************************************
     * Kernel 1
     *********************************************************************************/
    cl::Kernel kernel1 = KernelCache::get(control->queue, "reduce_by_key",
                                          "perBlockScanByKey", params);
    KernelWrap kWrapper1(kernel1);
    kWrapper1 << tempArrayVec << values_input << numElements << keySumArray << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);
    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     * Kernel 2
     *********************************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue, "reduce_by_key",
                                          "intraBlockInclusiveScanByKey", params);
    KernelWrap kWrapper2(kernel2);
    kWrapper2 << keySumArray << preSumArray << postSumArray << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);
    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     * Kernel 3
     *********************************************************************************/
    cl::Kernel kernel3 = KernelCache::get(control->queue, "reduce_by_key",
                                          "keyValueMapping", params);
    KernelWrap kWrapper3(kernel3);
    kWrapper3 << keys_input << keys_output << values_input << values_output
              << tempArrayVec << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);
    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    // Read back the last offset to learn how many unique keys were produced
    int *h_result = (int *) malloc( sizeof(int) );
    clEnqueueReadBuffer(control->queue(), tempArrayVec, CL_TRUE,
                        (numElements - 1) * sizeof(int), sizeof(int),
                        h_result, 0, 0, 0);
    *count = *h_result;
    //printf("h_result = %d\n", *count );
    free(h_result);

    //release buffers
    clReleaseMemObject(tempArrayVec);
    clReleaseMemObject(preSumArray);
    clReleaseMemObject(postSumArray);
    clReleaseMemObject(keySumArray);

    return clsparseSuccess;
} //end of reduce_by_key
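// Hypothetical sequential reference for the four-kernel pipeline above: a
// segmented reduction that, for each run of equal keys, emits the key once
// together with the sum of its values, and returns the number of runs.
// reduce_by_key_sketch is an illustrative name, not part of the library.
#include <vector>
int reduce_by_key_sketch(const std::vector<int>& keys, const std::vector<int>& values,
                         std::vector<int>& keys_out, std::vector<int>& values_out)
{
    keys_out.clear();
    values_out.clear();
    for (size_t i = 0; i < keys.size(); ++i)
    {
        if (i == 0 || keys[i] != keys[i - 1])  // OffsetCalculation: new segment starts
        {
            keys_out.push_back(keys[i]);
            values_out.push_back(values[i]);
        }
        else                                   // scan-by-key within the segment
        {
            values_out.back() += values[i];
        }
    }
    return static_cast<int>(keys_out.size());  // count of unique keys
}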