void setup_buffer( double alpha, double beta, const std::string& path )
{
    int fileError = sparseHeaderfromFile( &n_vals, &n_rows, &n_cols, path.c_str( ) );
    if( fileError != 0 )
    {
        throw clsparse::io_exception( "Could not read matrix market header from disk: " + path );
    }

    if( csrMatrixfromFile( row_offsets, col_indices, values, path.c_str( ), explicit_zeroes ) )
    {
        throw clsparse::io_exception( "Could not read matrix market from disk: " + path );
    }

    //n_rows = row_offsets.size( );
    //n_cols = col_indices.size( );
    //n_vals = values.size( );

    cudaError_t err = cudaMalloc( (void**) &device_row_offsets, ( n_rows + 1 ) * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMalloc device_row_offsets" );

    err = cudaMalloc( (void**) &device_col_indices, n_vals * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMalloc device_col_indices" );

    err = cudaMalloc( (void**) &device_values, n_vals * sizeof( T ) );
    CUDA_V_THROW( err, "cudaMalloc device_values" );

    err = cudaMalloc( (void**) &device_A, n_rows * n_cols * sizeof( T ) );
    CUDA_V_THROW( err, "cudaMalloc device_A" );
}
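// The CUDA_V_THROW macro used throughout this section is defined elsewhere in
// the benchmark sources. As a rough sketch (an assumption about its shape, not
// the repo's actual definition), it checks a status code and throws on
// failure, along these lines:
//
//   #define CUDA_V_THROW( _status, _message )                              \
//       do                                                                 \
//       {                                                                  \
//           if( (_status) != 0 )                                           \
//           {                                                              \
//               throw std::runtime_error( std::string( _message ) +        \
//                                         ": status " +                    \
//                                         std::to_string( (_status) ) );   \
//           }                                                              \
//       } while( 0 )
//
// Note that it is applied to both cudaError_t and cusparseStatus_t values,
// which works because cudaSuccess and CUSPARSE_STATUS_SUCCESS are both zero.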
xDense2Csr(StatisticalTimer& timer) : cusparseFunc(timer)
{
    cusparseStatus_t err = cusparseCreateMatDescr(&descrA);
    CUDA_V_THROW(err, "cusparseCreateMatDescr failed");

    err = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUDA_V_THROW(err, "cusparseSetMatType failed");

    err = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    CUDA_V_THROW(err, "cusparseSetMatIndexBase failed");

    n_rows = 0;
    n_cols = 0;
    n_vals = 0;

    device_col_indices = nullptr;
    device_row_offsets = nullptr;
    device_values = nullptr;
    device_A = nullptr;

    nnzPerRow = nullptr;
    devRowOffsets = nullptr;
    devColIndices = nullptr;
    devValues = nullptr;
}// end
void releaseGPUBuffer_deleteCPUBuffer( )
{
    CUDA_V_THROW( cudaFree( deviceCSRRowOffsets ), "cudafree deviceCSRRowOffsets" );
    CUDA_V_THROW( cudaFree( deviceCooRowInd ), "cudafree deviceCooRowInd" );

    row_indices.clear( );
    col_indices.clear( );
    values.clear( );
}
void initialize_gpu_buffer( )
{
    cudaError_t err = cudaMemcpy( deviceCooRowInd, &row_indices[ 0 ],
                                  row_indices.size( ) * sizeof( clsparseIdx_t ), cudaMemcpyHostToDevice );
    CUDA_V_THROW( err, "cudaMemcpy deviceCooRowInd" );

    err = cudaMemset( deviceCSRRowOffsets, 0x0, ( n_rows + 1 ) * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMemset deviceCSRRowOffsets" );
}// end of function
void reset_gpu_write_buffer()
{
    cudaError_t err = cudaMemset(devRowOffsets, 0x0, (n_rows + 1) * sizeof(int));
    CUDA_V_THROW(err, "cudaMemset reset_gpu_write_buffer: devRowOffsets");

    err = cudaMemset(devColIndices, 0x0, n_vals * sizeof(int));
    CUDA_V_THROW(err, "cudaMemset reset_gpu_write_buffer: devColIndices");

    err = cudaMemset(devValues, 0x0, n_vals * sizeof(T));
    CUDA_V_THROW(err, "cudaMemset reset_gpu_write_buffer: devValues");
}
void releaseGPUBuffer_deleteCPUBuffer( )
{
    // This is necessary because we run an iteration of tests and compute the
    // average time (in client.cpp), so the buffers must be released here
    // rather than waiting for the destructor.
    CUDA_V_THROW( cudaFree( device_values ), "cudafree device_values" );
    CUDA_V_THROW( cudaFree( device_row_offsets ), "cudafree device_row_offsets" );
    CUDA_V_THROW( cudaFree( device_col_indices ), "cudafree device_col_indices" );
    CUDA_V_THROW( cudaFree( device_A ), "cudafree device_A" );

    row_offsets.clear( );
    col_indices.clear( );
    values.clear( );
}
void initialize_gpu_buffer( )
{
    cudaError_t err = cudaMemcpy( device_row_offsets, &row_offsets[ 0 ],
                                  row_offsets.size( ) * sizeof( clsparseIdx_t ), cudaMemcpyHostToDevice );
    CUDA_V_THROW( err, "cudaMemcpy device_row_offsets" );

    err = cudaMemcpy( device_col_indices, &col_indices[ 0 ],
                      col_indices.size( ) * sizeof( clsparseIdx_t ), cudaMemcpyHostToDevice );
    CUDA_V_THROW( err, "cudaMemcpy device_col_indices" );

    err = cudaMemcpy( device_values, &values[ 0 ],
                      values.size( ) * sizeof( T ), cudaMemcpyHostToDevice );
    CUDA_V_THROW( err, "cudaMemcpy device_values" );

    err = cudaMemset( device_A, 0x0, n_rows * n_cols * sizeof( T ) );
    CUDA_V_THROW( err, "cudaMemset device_A" );
}
void reset_gpu_write_buffer( )
{
    cudaError_t err = cudaMemset( deviceCSRRowOffsets, 0x0, ( n_rows + 1 ) * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMemset deviceCSRRowOffsets" );
    cudaDeviceSynchronize( );
}// end of function
xCsr2Dense( StatisticalTimer& timer, bool read_explicit_zeroes = true ): cusparseFunc( timer )
{
    cusparseStatus_t err = cusparseCreateMatDescr( &descrA );
    CUDA_V_THROW( err, "cusparseCreateMatDescr failed" );

    err = cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL );
    CUDA_V_THROW( err, "cusparseSetMatType failed" );

    err = cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO );
    CUDA_V_THROW( err, "cusparseSetMatIndexBase failed" );

    n_rows = 0;
    n_cols = 0;
    n_vals = 0;
    explicit_zeroes = read_explicit_zeroes;
}
void setup_buffer( double alpha, double beta, const std::string& path )
{
    int fileError = sparseHeaderfromFile( &n_vals, &n_rows, &n_cols, path.c_str( ) );
    if( fileError != 0 )
    {
        throw clsparse::io_exception( "Could not read matrix market header from disk: " + path );
    }

    if( cooMatrixfromFile( row_indices, col_indices, values, path.c_str( ), explicit_zeroes ) )
    {
        throw clsparse::io_exception( "Could not read matrix market from disk: " + path );
    }

    // Input: COO row indices
    cudaError_t err = cudaMalloc( (void**)&deviceCooRowInd, n_vals * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMalloc deviceCooRowInd" );

    // Output: CSR row offsets
    err = cudaMalloc( (void**)&deviceCSRRowOffsets, ( n_rows + 1 ) * sizeof( clsparseIdx_t ) );
    CUDA_V_THROW( err, "cudaMalloc deviceCSRRowOffsets" );
}// End of function
template<>
void xCoo2Csr<double>::xCoo2Csr_Function( bool flush )
{
    cuSparseStatus = cusparseXcoo2csr( handle,
                                       deviceCooRowInd,
                                       n_vals,
                                       n_rows,
                                       deviceCSRRowOffsets,
                                       CUSPARSE_INDEX_BASE_ZERO );
    CUDA_V_THROW( cuSparseStatus, "cusparseXcoo2csr" );
    cudaDeviceSynchronize( );
}
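// For reference, a minimal standalone sketch of the same COO->CSR conversion
// outside the benchmark harness. This is a hypothetical example, not part of
// the repo; it assumes 0-based int indices and omits error handling for
// brevity.

#include <cusparse_v2.h>
#include <cuda_runtime.h>
#include <vector>

void coo2csr_example( )
{
    // 3x3 matrix with 4 nonzeros; COO row indices must be sorted by row.
    std::vector<int> cooRows = { 0, 0, 1, 2 };
    const int nnz = 4, m = 3;

    int* dCooRows = nullptr;
    int* dCsrRowPtr = nullptr;
    cudaMalloc( (void**)&dCooRows, nnz * sizeof( int ) );
    cudaMalloc( (void**)&dCsrRowPtr, ( m + 1 ) * sizeof( int ) );
    cudaMemcpy( dCooRows, cooRows.data( ), nnz * sizeof( int ), cudaMemcpyHostToDevice );

    cusparseHandle_t handle;
    cusparseCreate( &handle );

    // Compresses the nnz row indices into m+1 row offsets: expect { 0, 2, 3, 4 }.
    cusparseXcoo2csr( handle, dCooRows, nnz, m, dCsrRowPtr, CUSPARSE_INDEX_BASE_ZERO );
    cudaDeviceSynchronize( );

    cusparseDestroy( handle );
    cudaFree( dCooRows );
    cudaFree( dCsrRowPtr );
}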
template<>
void xDense2Csr<double>::csr2dense_Function(bool flush)
{
    // Dense matrix A is stored in column-major format; lda == n_rows.
    cuSparseStatus = cusparseDcsr2dense(handle,
                                        n_rows,
                                        n_cols,
                                        descrA,
                                        device_values,
                                        device_row_offsets,
                                        device_col_indices,
                                        device_A,
                                        n_rows);
    CUDA_V_THROW(cuSparseStatus, "cusparseDcsr2dense");
    cudaDeviceSynchronize();
}// end of function
template<>
void xCsr2Dense<double>::xCsr2Dense_Function( bool flush )
{
    cuSparseStatus = cusparseDcsr2dense( handle,
                                         n_rows,
                                         n_cols,
                                         descrA,
                                         device_values,
                                         device_row_offsets,
                                         device_col_indices,
                                         device_A,
                                         n_rows );
    CUDA_V_THROW( cuSparseStatus, "cusparseDcsr2dense" );
    cudaDeviceSynchronize( );
}
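// A minimal standalone sketch of the same CSR->dense expansion (hypothetical
// example, not part of the repo): a 2x3 CSR matrix is scattered into a
// column-major dense buffer with lda = m. Error handling is omitted.

#include <cusparse_v2.h>
#include <cuda_runtime.h>

void csr2dense_example( )
{
    const int m = 2, n = 3, nnz = 3;
    // [ 1 0 2 ]
    // [ 0 3 0 ]
    const double csrVal[] = { 1.0, 2.0, 3.0 };
    const int csrRowPtr[] = { 0, 2, 3 };
    const int csrColInd[] = { 0, 2, 1 };

    double* dVal;  int* dRowPtr;  int* dColInd;  double* dA;
    cudaMalloc( (void**)&dVal, nnz * sizeof( double ) );
    cudaMalloc( (void**)&dRowPtr, ( m + 1 ) * sizeof( int ) );
    cudaMalloc( (void**)&dColInd, nnz * sizeof( int ) );
    cudaMalloc( (void**)&dA, m * n * sizeof( double ) );
    cudaMemcpy( dVal, csrVal, sizeof( csrVal ), cudaMemcpyHostToDevice );
    cudaMemcpy( dRowPtr, csrRowPtr, sizeof( csrRowPtr ), cudaMemcpyHostToDevice );
    cudaMemcpy( dColInd, csrColInd, sizeof( csrColInd ), cudaMemcpyHostToDevice );

    cusparseHandle_t handle;
    cusparseCreate( &handle );
    cusparseMatDescr_t descr;
    cusparseCreateMatDescr( &descr );

    // After this call dA holds { 1, 0, 0, 3, 2, 0 } in column-major order.
    cusparseDcsr2dense( handle, m, n, descr, dVal, dRowPtr, dColInd, dA, m );
    cudaDeviceSynchronize( );

    cusparseDestroyMatDescr( descr );
    cusparseDestroy( handle );
    cudaFree( dVal );  cudaFree( dRowPtr );  cudaFree( dColInd );  cudaFree( dA );
}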
template<>
void xDense2Csr<double>::xDense2Csr_Function(bool flush)
{
    // device_A is a dense matrix in column-major format; lda is the number of
    // elements in the major dimension (here, the number of rows).
    cuSparseStatus = cusparseDdense2csr(handle,
                                        n_rows,
                                        n_cols,
                                        descrA,
                                        device_A,
                                        n_rows,
                                        nnzPerRow,
                                        devValues,
                                        devRowOffsets,
                                        devColIndices);
    CUDA_V_THROW(cuSparseStatus, "cusparseDdense2csr");
    cudaDeviceSynchronize();
}// end of function
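// cusparse<t>dense2csr requires a prior cusparse<t>nnz call with
// CUSPARSE_DIRECTION_ROW to obtain both the per-row nonzero counts and the
// total nnz; initialize_gpu_buffer below follows this pattern. A condensed
// sketch of the pairing (hypothetical names, buffers assumed already
// allocated):
//
//   int nnzTotal = 0;
//   cusparseDnnz( handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
//                 dA, m /* lda */, dNnzPerRow, &nnzTotal );
//   // dCsrVal / dCsrColInd must hold at least nnzTotal elements,
//   // dCsrRowPtr at least m + 1.
//   cusparseDdense2csr( handle, m, n, descrA, dA, m /* lda */,
//                       dNnzPerRow, dCsrVal, dCsrRowPtr, dCsrColInd );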
void setup_buffer(double alpha, double beta, const std::string& path)
{
    int fileError = sparseHeaderfromFile(&n_vals, &n_rows, &n_cols, path.c_str());
    if (fileError != 0)
    {
        throw clsparse::io_exception("Could not read matrix market header from disk: " + path);
    }

    if (csrMatrixfromFile(row_offsets, col_indices, values, path.c_str()))
    {
        throw clsparse::io_exception("Could not read matrix market from disk: " + path);
    }

    // Input CSR and the dense staging matrix
    cudaError_t err = cudaMalloc((void**)&device_row_offsets, (n_rows + 1) * sizeof(int));
    CUDA_V_THROW(err, "cudaMalloc device_row_offsets");

    err = cudaMalloc((void**)&device_col_indices, n_vals * sizeof(int));
    CUDA_V_THROW(err, "cudaMalloc device_col_indices");

    err = cudaMalloc((void**)&device_values, n_vals * sizeof(T));
    CUDA_V_THROW(err, "cudaMalloc device_values");

    err = cudaMalloc((void**)&device_A, n_rows * n_cols * sizeof(T));
    CUDA_V_THROW(err, "cudaMalloc device_A");

    // Output CSR
    err = cudaMalloc((void**)&devRowOffsets, (n_rows + 1) * sizeof(int));
    CUDA_V_THROW(err, "cudaMalloc devRowOffsets");

    err = cudaMalloc((void**)&devColIndices, n_vals * sizeof(int));
    CUDA_V_THROW(err, "cudaMalloc devColIndices");

    err = cudaMalloc((void**)&devValues, n_vals * sizeof(T));
    CUDA_V_THROW(err, "cudaMalloc devValues");

    // Allocate memory for nnzPerRow
    err = cudaMalloc((void**)&nnzPerRow, n_rows * sizeof(int));
    CUDA_V_THROW(err, "cudaMalloc nnzPerRow");
}// end
void reset_gpu_write_buffer( )
{
    cudaError_t err = cudaMemset( device_A, 0x0, n_rows * n_cols * sizeof( T ) );
    CUDA_V_THROW( err, "cudaMemset reset_gpu_write_buffer" );
}
void initialize_gpu_buffer()
{
    cudaError_t err = cudaMemcpy(device_row_offsets, &row_offsets[0],
                                 row_offsets.size() * sizeof(int), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_row_offsets");

    err = cudaMemcpy(device_col_indices, &col_indices[0],
                     col_indices.size() * sizeof(int), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_col_indices");

    err = cudaMemcpy(device_values, &values[0], values.size() * sizeof(T), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_values");

    err = cudaMemset(device_A, 0x0, n_rows * n_cols * sizeof(T));
    CUDA_V_THROW(err, "cudaMemset device_A");

    // Call csr2dense to convert the input to dense format
    csr2dense_Function(true);

    int nnzA;
    // Compute the number of nonzero elements per row
    if (typeid(T) == typeid(float))
    {
        cuSparseStatus = cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, n_rows, n_cols, descrA,
                                      reinterpret_cast<float*>(device_A), n_rows, nnzPerRow, &nnzA);
        CUDA_V_THROW(cuSparseStatus, "cusparseSnnz");
    }
    else if (typeid(T) == typeid(double))
    {
        cuSparseStatus = cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, n_rows, n_cols, descrA,
                                      reinterpret_cast<double*>(device_A), n_rows, nnzPerRow, &nnzA);
        CUDA_V_THROW(cuSparseStatus, "cusparseDnnz");
    }
    else
    {
        throw std::runtime_error("xDense2Csr: unsupported value type");
    }

    if (nnzA != n_vals)
    {
        throw std::runtime_error("xDense2Csr: nnz computed on the dense matrix does not match the input");
    }

    cudaDeviceSynchronize();

    // Once the input is in dense format, the input CSR buffers are no longer needed.
    CUDA_V_THROW(cudaFree(device_values), "cudafree device_values");
    CUDA_V_THROW(cudaFree(device_row_offsets), "cudafree device_row_offsets");
    CUDA_V_THROW(cudaFree(device_col_indices), "cudafree device_col_indices");
}// end
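// How these pieces fit together is not shown in this section. Based on the
// reference to client.cpp above, the harness presumably drives each benchmark
// in roughly this order; this is a sketch of the assumed call sequence, not
// the actual client.cpp code:
//
//   xDense2Csr<double> bench(timer);
//   bench.setup_buffer(alpha, beta, path);      // read .mtx, cudaMalloc buffers
//   bench.initialize_gpu_buffer();              // upload CSR, expand to dense
//   for (int i = 0; i < iterations; ++i)
//   {
//       bench.xDense2Csr_Function(true);        // timed conversion
//       bench.reset_gpu_write_buffer();         // zero outputs between runs
//   }
//   bench.releaseGPUBuffer_deleteCPUBuffer();   // free before the destructor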