void fill_matrix(const command_queue &q, int n, int m,
        const row_t *row, const col_t *col, const double *val)
{
    // Upload an n-by-m CSR matrix (row/col/val) to the device and convert
    // it to cuSPARSE HYB format (stored in `mat`, described by `desc`).
    //
    // The caller may pass a sub-range of a larger matrix, in which case
    // row[0] != 0 and col/val must be read starting at offset row[0].
    // The number of nonzeros actually referenced is therefore
    // row[n] - row[0].
    //
    // BUG FIX: the copies below used to be sized row[n], which reads
    // row[0] elements past the end of col/val whenever row[0] != 0.
    const size_t nnz = row[n] - row[0];

    device_vector<int>    r(q, n + 1, row);
    device_vector<int>    c(q, nnz, col + row[0]);
    device_vector<double> v(q, nnz, val + row[0]);

    // Rebase the row pointers to zero on the device so they match the
    // shifted col/val arrays.
    if (row[0] != 0) vector<int>(q, r) -= row[0];

    // ELL width 0 + PARTITION_AUTO lets cuSPARSE pick the HYB split.
    cuda_check( cusparseDcsr2hyb(handle, n, m, desc.get(),
                v.raw_ptr(), r.raw_ptr(), c.raw_ptr(), mat.get(),
                0, CUSPARSE_HYB_PARTITION_AUTO
                ) );
}
int TxMatrixOptimizationDataCU::ingestLocalMatrix(SparseMatrix& A) {
  // Split A into its diagonal (local) block — built as CSR on the host,
  // uploaded and converted to HYB on the device for SpMV — and its
  // off-diagonal (halo) block, which goes into scatterFromHalo.  Also sets
  // up the gelus SOR/Gauss-Seidel data, the MPI halo-exchange buffers, the
  // multigrid fine-to-coarse map, and a scratch vector.
  std::vector<local_int_t> i(A.localNumberOfRows + 1, 0);
  // Slight overallocation for these arrays (halo entries are excluded below).
  std::vector<local_int_t> j;
  j.reserve(A.localNumberOfNonzeros);
  std::vector<double> a;
  a.reserve(A.localNumberOfNonzeros);

  scatterFromHalo.setNumRows(A.localNumberOfRows);
  scatterFromHalo.setNumCols(A.localNumberOfColumns);
  scatterFromHalo.clear();

  // We're splitting the matrix into diagonal and off-diagonal block to
  // enable overlapping of computation and communication.
  i[0] = 0;
  for (local_int_t m = 0; m < A.localNumberOfRows; ++m) {
    local_int_t nonzerosInRow = 0;
    for (local_int_t n = 0; n < A.nonzerosInRow[m]; ++n) {
      local_int_t col = A.mtxIndL[m][n];
      if (col < A.localNumberOfRows) {
        // Column inside the local block: keep it in the CSR arrays.
        j.push_back(col);
        a.push_back(A.matrixValues[m][n]);
        ++nonzerosInRow;
      } else {
        // Halo column: applied separately after communication.
        scatterFromHalo.addEntry(m, col, A.matrixValues[m][n]);
      }
    }
    i[m + 1] = i[m] + nonzerosInRow;
  }

  // Setup SpMV data on Device.
  // NOTE(review): the index arrays are handed to cuSPARSE as int*; this is
  // only valid if local_int_t is int — confirm against the build config.
  cudaError_t err = cudaSuccess;
  int* i_d;
  err = cudaMalloc((void**)&i_d, i.size() * sizeof(i[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(i_d, &i[0], i.size() * sizeof(i[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  int* j_d;
  err = cudaMalloc((void**)&j_d, j.size() * sizeof(j[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(j_d, &j[0], j.size() * sizeof(j[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  double* a_d;
  err = cudaMalloc((void**)&a_d, a.size() * sizeof(a[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(a_d, &a[0], a.size() * sizeof(a[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);

  cusparseStatus_t cerr = CUSPARSE_STATUS_SUCCESS;
  cerr = cusparseCreateMatDescr(&matDescr);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseSetMatIndexBase(matDescr, CUSPARSE_INDEX_BASE_ZERO);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseSetMatType(matDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseCreateHybMat(&localMatrix);
  CHKCUSPARSEERR(cerr);
  // ELL width 27 with PARTITION_USER: 27 is the max row length of the
  // HPCG 27-point stencil, so the whole matrix fits the ELL part.
  cerr = cusparseDcsr2hyb(handle, A.localNumberOfRows, A.localNumberOfColumns,
                          matDescr, a_d, i_d, j_d, localMatrix, 27,
                          CUSPARSE_HYB_PARTITION_USER);
  CHKCUSPARSEERR(cerr);

  // The HYB matrix keeps its own copy of the data, so the temporary CSR
  // device arrays can be released now (they were previously leaked).
  err = cudaFree(i_d);
  CHKCUDAERR(err);
  err = cudaFree(j_d);
  CHKCUDAERR(err);
  err = cudaFree(a_d);
  CHKCUDAERR(err);

#ifndef HPCG_NOMPI
  // Device-side halo-exchange metadata and send buffer.
  err = cudaMalloc((void**)&elementsToSend,
                   A.totalToBeSent * sizeof(*elementsToSend));
  CHKCUDAERR(err);
  err = cudaMemcpy(elementsToSend, A.elementsToSend,
                   A.totalToBeSent * sizeof(*elementsToSend),
                   cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  err = cudaMalloc((void**)&sendBuffer_d, A.totalToBeSent * sizeof(double));
  CHKCUDAERR(err);
#endif

  // Set up the GS data.
  gelusStatus_t gerr = GELUS_STATUS_SUCCESS;
  gelusSolveDescription_t solveDescr;
  gerr = gelusCreateSolveDescr(&solveDescr);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveOperation(solveDescr, GELUS_OPERATION_NON_TRANSPOSE);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveFillMode(solveDescr, GELUS_FILL_MODE_FULL);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveStorageFormat(solveDescr, GELUS_STORAGE_FORMAT_HYB);
  CHKGELUSERR(gerr);
  gerr = gelusSetOptimizationLevel(solveDescr, GELUS_OPTIMIZATION_LEVEL_THREE);
  CHKGELUSERR(gerr);
  gerr = cugelusCreateSorIterationData(&gsContext);
  CHKGELUSERR(gerr);

#ifdef HPCG_DEBUG
  std::cout << A.localNumberOfRows << std::endl;
  std::cout << A.localNumberOfColumns << std::endl;
  std::cout << A.localNumberOfNonzeros << std::endl;
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  if (myrank == 0) {
    dumpMatrix(std::cout, i, j, a);
  }
#endif

  gerr = cugelusDcsrsor_iteration_analysis(
      A.localNumberOfRows, solveDescr, GELUS_SOR_SYMMETRIC, 1.0,
      &i[0], &j[0], &a[0], gsContext);
  // BUG FIX: this status used to be overwritten (unchecked) by the
  // descriptor destruction below, so a failed analysis went unnoticed.
  CHKGELUSERR(gerr);

  gerr = gelusDestroySolveDescr(solveDescr);
  CHKGELUSERR(gerr);

  if (A.mgData) {
    // Fine-to-coarse restriction map for the multigrid V-cycle.
    err = cudaMalloc((void**)&f2c,
                     A.mgData->rc->localLength * sizeof(local_int_t));
    CHKCUDAERR(err);
    err = cudaMemcpy(f2c, A.mgData->f2cOperator,
                     A.mgData->rc->localLength * sizeof(local_int_t),
                     cudaMemcpyHostToDevice);
    CHKCUDAERR(err);
  }
  err = cudaMalloc((void**)&workvector, A.localNumberOfRows * sizeof(double));
  CHKCUDAERR(err);
  return (int)cerr | (int)gerr | (int)err;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- testing sparse matrix vector product

   Runs SpMV with several formats (MKL CSR, MAGMA CSR/ELL/SELL-P, cuSPARSE
   CSR/HYB) on each matrix given on the command line and compares every
   result against the MAGMA CSR reference.
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    magma_queue_t queue;
    magma_queue_create( /*devices[ opts->device ],*/ &queue );

    magma_d_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL;
    hA_SELLP.blocksize = 8;
    hA_SELLP.alignment = 8;
    real_Double_t start, end, res;
    magma_int_t *pntre;

    double c_one  = MAGMA_D_MAKE(1.0, 0.0);
    double c_zero = MAGMA_D_MAKE(0.0, 0.0);

    magma_int_t i, j;
    // Leading options select the SELL-P geometry; everything after them is
    // a matrix (Matrix Market file, or "LAPLACE2D n" for a stencil).
    for( i = 1; i < argc; ++i ) {
        if ( strcmp("--blocksize", argv[i]) == 0 ) {
            hA_SELLP.blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 ) {
            hA_SELLP.alignment = atoi( argv[++i] );
        } else
            break;
    }
    printf( "\n# usage: ./run_dspmv"
            " [ --blocksize %d --alignment %d (for SELLP) ]"
            " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment );

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            magma_dm_5stencil( laplace_size, &hA, queue );
        } else {                        // file-matrix test
            magma_d_csr_mtx( &hA, argv[i], queue );
        }

        printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n",
                (int) hA.num_rows, (int) hA.num_cols, (int) hA.nnz );

        real_Double_t FLOPS = 2.0*hA.nnz/1e9;

        magma_d_vector hx, hy, dx, dy, hrefvec, hcheck;

        // init CPU vectors
        magma_d_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue );
        magma_d_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue );

        // init DEV vectors
        magma_d_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        #ifdef MAGMA_WITH_MKL
        // calling MKL with CSR (4-array variant: row-begin + row-end pointers)
        pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) );
        pntre[0] = 0;
        for (j=0; j<hA.num_rows; j++ ) {
            pntre[j] = hA.row[j+1];          // row-end pointers
        }
        MKL_INT num_rows = hA.num_rows;
        MKL_INT num_cols = hA.num_cols;
        MKL_INT nnz = hA.nnz;

        MKL_INT *col;
        TESTING_MALLOC_CPU( col, MKL_INT, nnz );
        for( magma_int_t t=0; t < hA.nnz; ++t ) {
            col[ t ] = hA.col[ t ];
        }
        MKL_INT *row;
        TESTING_MALLOC_CPU( row, MKL_INT, num_rows );
        for( magma_int_t t=0; t < hA.num_rows; ++t ) {
            // BUG FIX: row-begin pointers must come from hA.row;
            // this loop used to copy hA.col (column indices).
            row[ t ] = hA.row[ t ];
        }

        start = magma_wtime();
        for (j=0; j<10; j++ ) {
            mkl_dcsrmv( "N", &num_rows, &num_cols,
                        MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val),
                        col, row, pntre,
                        MKL_ADDR(hx.val),
                        MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );
        }
        end = magma_wtime();
        printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        TESTING_FREE_CPU( row );
        TESTING_FREE_CPU( col );
        free(pntre);
        #endif // MAGMA_WITH_MKL

        // copy matrix to GPU
        magma_d_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );
        // SpMV on GPU (CSR) -- this is the reference!
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );
        magma_d_mfree(&dA, queue );
        magma_d_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue );

        // convert to ELL and copy to GPU
        magma_d_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue );
        magma_d_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue );
        magma_d_mfree(&hA_ELL, queue );
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // SpMV on GPU (ELL)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA_ELL, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n",
                (end-start)/10, FLOPS*10/(end-start) );
        magma_d_mfree(&dA_ELL, queue );
        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        // BUG FIX (all residual checks below): accumulate |diff| instead of
        // signed diffs, which could cancel and let a wrong result pass.
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ ) {
            double diff = MAGMA_D_REAL(hcheck.val[k])
                        - MAGMA_D_REAL(hrefvec.val[k]);
            res = res + ( (diff < 0.0) ? -diff : diff );
        }
        if ( res < .000001 )
            printf("# tester spmv ELL: ok\n");
        else
            printf("# tester spmv ELL: failed\n");
        magma_d_vfree( &hcheck, queue );

        // convert to SELLP and copy to GPU
        magma_d_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue );
        magma_d_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue );
        magma_d_mfree(&hA_SELLP, queue );
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // SpMV on GPU (SELLP)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n",
                (end-start)/10, FLOPS*10/(end-start) );
        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ ) {
            double diff = MAGMA_D_REAL(hcheck.val[k])
                        - MAGMA_D_REAL(hrefvec.val[k]);
            res = res + ( (diff < 0.0) ? -diff : diff );
        }
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv SELL-P: ok\n");
        else
            printf("# tester spmv SELL-P: failed\n");
        magma_d_vfree( &hcheck, queue );
        magma_d_mfree(&dA_SELLP, queue );

        // SpMV on GPU (CUSPARSE - CSR)
        // CUSPARSE context //
        cusparseHandle_t cusparseHandle = 0;
        cusparseStatus_t cusparseStatus;
        cusparseStatus = cusparseCreate(&cusparseHandle);
        cusparseSetStream( cusparseHandle, queue );

        cusparseMatDescr_t descr = 0;
        cusparseStatus = cusparseCreateMatDescr(&descr);
        cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

        double alpha = c_one;
        double beta  = c_zero;
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // copy matrix to GPU
        magma_d_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus =
                cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE,
                               hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr,
                               dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        cusparseMatDescr_t descrA;
        cusparseStatus = cusparseCreateMatDescr(&descrA);
        if (cusparseStatus != 0) printf("error\n");
        cusparseHybMat_t hybA;
        cusparseStatus = cusparseCreateHybMat( &hybA );
        if (cusparseStatus != 0) printf("error\n");

        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ ) {
            double diff = MAGMA_D_REAL(hcheck.val[k])
                        - MAGMA_D_REAL(hrefvec.val[k]);
            res = res + ( (diff < 0.0) ? -diff : diff );
        }
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE CSR: ok\n");
        else
            printf("# tester spmv cuSPARSE CSR: failed\n");
        magma_d_vfree( &hcheck, queue );
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // convert CSR to HYB on the device, then time cuSPARSE HYB SpMV
        cusparseDcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols,
                         descrA, dA.dval, dA.drow, dA.dcol,
                         hybA, 0, CUSPARSE_HYB_PARTITION_AUTO);

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus =
                cusparseDhybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &alpha, descrA, hybA, dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ ) {
            double diff = MAGMA_D_REAL(hcheck.val[k])
                        - MAGMA_D_REAL(hrefvec.val[k]);
            res = res + ( (diff < 0.0) ? -diff : diff );
        }
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE HYB: ok\n");
        else
            printf("# tester spmv cuSPARSE HYB: failed\n");
        magma_d_vfree( &hcheck, queue );

        cusparseDestroyMatDescr( descrA );
        cusparseDestroyHybMat( hybA );
        // BUG FIX: `descr` (used for the CSR run) was never destroyed.
        cusparseDestroyMatDescr( descr );
        cusparseDestroy( cusparseHandle );

        magma_d_mfree(&dA, queue );

        printf("\n\n");

        // free CPU memory
        magma_d_mfree(&hA, queue );
        magma_d_vfree(&hx, queue );
        magma_d_vfree(&hy, queue );
        magma_d_vfree(&hrefvec, queue );
        // free GPU memory
        magma_d_vfree(&dx, queue );
        magma_d_vfree(&dy, queue );

        i++;
    }

    magma_queue_destroy( queue );
    TESTING_FINALIZE();
    return 0;
}