void initialize_gpu_buffer()
{
    // Copy the CSR input (row offsets, column indices, values) from host to device.
    cudaError_t err = cudaMemcpy(device_row_offsets, &row_offsets[0], row_offsets.size() * sizeof(int), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_row_offsets");

    err = cudaMemcpy(device_col_indices, &col_indices[0], col_indices.size() * sizeof(int), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_col_indices");

    err = cudaMemcpy(device_values, &values[0], values.size() * sizeof(T), cudaMemcpyHostToDevice);
    CUDA_V_THROW(err, "cudaMemcpy device_values");

    // Zero the dense output buffer before the conversion.
    err = cudaMemset(device_A, 0x0, n_rows * n_cols * sizeof(T));
    CUDA_V_THROW(err, "cudaMemset device_A");

    // Call csr2dense to get the input in dense format.
    csr2dense_Function(true);

    int nnzA;
    // Compute the number of nonzero elements per row of the dense matrix.
    if (typeid(T) == typeid(float))
    {
        cuSparseStatus = cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, n_rows, n_cols, descrA,
                                      reinterpret_cast<float*>(device_A), n_rows, nnzPerRow, &nnzA);
        CUDA_V_THROW(cuSparseStatus, "cusparseSnnz");
    }
    else if (typeid(T) == typeid(double))
    {
        cuSparseStatus = cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, n_rows, n_cols, descrA,
                                      reinterpret_cast<double*>(device_A), n_rows, nnzPerRow, &nnzA);
        CUDA_V_THROW(cuSparseStatus, "cusparseDnnz");
    }
    else
    {
        // error: unsupported value type
    }

    if (nnzA != n_vals)
    {
        // error: nonzero count of the dense matrix does not match the CSR input
    }

    cudaDeviceSynchronize();

    // Once the input is in dense format, the CSR input buffers are no longer needed.
    CUDA_V_THROW(cudaFree(device_values), "cudaFree device_values");
    CUDA_V_THROW(cudaFree(device_row_offsets), "cudaFree device_row_offsets");
    CUDA_V_THROW(cudaFree(device_col_indices), "cudaFree device_col_indices");
} // end initialize_gpu_buffer
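The csr2dense_Function(true) call above is not shown here. As a minimal sketch only, assuming the legacy cusparseScsr2dense API (deprecated in CUDA 11 and removed in CUDA 12), a column-major dense layout with leading dimension n_rows, and the same member buffers and descriptor used above, the float case might look like this; the bool parameter's meaning is not given in the original and is left as an unused placeholder:

// Hypothetical float-only sketch of the CSR-to-dense conversion step (not the original implementation).
void csr2dense_Function(bool /*unused_flag*/)
{
    cusparseStatus_t status = cusparseScsr2dense(
        handle,                                            // cuSPARSE handle
        n_rows, n_cols,                                    // dense matrix dimensions
        descrA,                                            // matrix descriptor
        reinterpret_cast<const float*>(device_values),     // CSR values
        device_row_offsets,                                // CSR row offsets (n_rows + 1 entries)
        device_col_indices,                                // CSR column indices
        reinterpret_cast<float*>(device_A),                // dense output, column-major
        n_rows);                                           // leading dimension
    CUDA_V_THROW(status, "cusparseScsr2dense");
}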
// Float overload that forwards to cusparseSnnz using the global cuSPARSE handle.
cusparseStatus_t cusparseXnnz(cusparseDirection_t dirA, int m, int n, const cusparseMatDescr_t descrA,
                              const float *A, int lda, int *nnzPerRowColumn, int *nnzTotalDevHostPtr)
{
    return cusparseSnnz(g_context->cusparseHandle, dirA, m, n, descrA, A, lda,
                        nnzPerRowColumn, nnzTotalDevHostPtr);
}
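A matching double overload is not part of the original snippet, but adding one would let callers rely on ordinary C++ overload resolution on the matrix pointer type instead of the typeid branching in initialize_gpu_buffer. A sketch under the same assumption of a shared g_context->cusparseHandle:

// Hypothetical companion overload for double-precision matrices.
cusparseStatus_t cusparseXnnz(cusparseDirection_t dirA, int m, int n, const cusparseMatDescr_t descrA,
                              const double *A, int lda, int *nnzPerRowColumn, int *nnzTotalDevHostPtr)
{
    // Forwards to the double-precision entry point with the shared cuSPARSE handle.
    return cusparseDnnz(g_context->cusparseHandle, dirA, m, n, descrA, A, lda,
                        nnzPerRowColumn, nnzTotalDevHostPtr);
}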