/**
 * Applies the PaStiX factorization stored in `precond` to the vector `b`.
 *
 * `b` is copied to the host, handed to pastix() as the right-hand side for
 * the tasks API_TASK_SOLVE through API_TASK_REFINEMENT, and the same host
 * buffer is afterwards transferred into `x` at b's memory location.
 *
 * @param b        right-hand side; read only, copied to the host first.
 * @param x        output vector; its previous contents are released with
 *                 magma_c_vfree() before it is overwritten.
 * @param precond  preconditioner filled by magma_cpastixsetup(): the matrix
 *                 arrays in M, perm in int_array_1, invp in int_array_2,
 *                 plus pastix_data, iparm and dparm.
 *
 * @return MAGMA_SUCCESS in every case. When HAVE_PASTIX or PRECISION_d is
 *         not defined only an error message is printed.
 */
magma_int_t
magma_capplypastix( magma_c_vector b, magma_c_vector *x,
                    magma_c_preconditioner *precond )
{
#if defined(HAVE_PASTIX)
    #if defined(PRECISION_d)

    pastix_int_t    ncol;               /* Size of the matrix */
    pastix_int_t   *colptr = NULL;      /* Indexes of first element of each column in row and values */
    pastix_int_t   *rows   = NULL;      /* Row of each element of the matrix */
    pastix_float_t *values = NULL;      /* Value of each element of the matrix */
    pastix_float_t *rhs    = NULL;      /* right hand side */
    pastix_int_t   *iparm;              /* integer parameters for pastix */
    float          *dparm;              /* floating parameters for pastix */
    pastix_int_t   *perm   = NULL;      /* Permutation tabular */
    pastix_int_t   *invp   = NULL;      /* Reverse permutation tabular */

    /* Work on a host copy of b; pastix() receives this buffer as rhs. */
    magma_c_vector b_h;
    magma_c_vtransfer( b, &b_h, b.memory_location, Magma_CPU );

    rhs    = (pastix_float_t*) b_h.val;
    ncol   = precond->M.num_rows;
    colptr = (pastix_int_t*) precond->M.col;
    rows   = (pastix_int_t*) precond->M.row;
    values = (pastix_float_t*) precond->M.val;
    iparm  = precond->iparm;
    dparm  = precond->dparm;

    /* magma_cpastixsetup() stores perm in int_array_1 and invp in
       int_array_2; reading both from int_array_1 would hand pastix() the
       forward permutation twice. */
    perm = (pastix_int_t*) precond->int_array_1;
    invp = (pastix_int_t*) precond->int_array_2;

    /*******************************************/
    /* Step 5 - Solve                          */
    /* For each one of your Right-hand-side    */
    /* members.                                */
    /* Also consider using multiple            */
    /* right-hand-side members.                */
    /*******************************************/
    iparm[IPARM_START_TASK] = API_TASK_SOLVE;
    iparm[IPARM_END_TASK]   = API_TASK_REFINEMENT;

    pastix( &(precond->pastix_data), MPI_COMM_WORLD,
            ncol, colptr, rows, values,
            perm, invp, rhs, 1, iparm, dparm );

    // fix that x is not allocated every time
    // in case of many iterations, it might be faster to use
    // magma_csetvector( ncol,
    //                   b_h.val, 1, x->val, 1 );
    magma_c_vfree( x );
    magma_c_vtransfer( b_h, x, Magma_CPU, b.memory_location );

    magma_c_vfree( &b_h );

    #else
    printf( "error: only real supported yet.\n");
    #endif

#else
    printf( "error: pastix not available.\n");
#endif

    return MAGMA_SUCCESS;
}
/**
 * Prepares a PaStiX direct-solver preconditioner for the matrix `A`.
 *
 * A host CSR copy of A is created, PaStiX parameters are initialised, and
 * pastix() is run for the tasks API_TASK_ORDERING through API_TASK_NUMFACT.
 * The matrix arrays, the permutation arrays, the PaStiX handle and the
 * parameter arrays are then stored in `precond` for magma_capplypastix().
 *
 * @param A        system matrix; converted to CSR on the host if needed.
 * @param b        right-hand side; a host copy is only forwarded to the
 *                 parameter-initialisation call of pastix().
 * @param precond  output; receives M (val/col/row/num_rows/num_cols/
 *                 memory_location), int_array_1 (perm), int_array_2 (invp),
 *                 pastix_data, iparm and dparm. All of these buffers stay
 *                 allocated on return and are owned by `precond`.
 *
 * @return MAGMA_SUCCESS in every case. When HAVE_PASTIX or PRECISION_d is
 *         not defined only an error message is printed.
 */
magma_int_t
magma_cpastixsetup( magma_c_sparse_matrix A, magma_c_vector b,
                    magma_c_preconditioner *precond )
{
#if defined(HAVE_PASTIX)
    #if defined(PRECISION_d)

    pastix_data_t  *pastix_data = NULL; /* Pointer to a storage structure needed by pastix */
    pastix_int_t    ncol;               /* Size of the matrix */
    pastix_int_t   *colptr = NULL;      /* Indexes of first element of each column in row and values */
    pastix_int_t   *rows   = NULL;      /* Row of each element of the matrix */
    pastix_float_t *values = NULL;      /* Value of each element of the matrix */
    pastix_float_t *rhs    = NULL;      /* right hand side */
    pastix_int_t   *iparm  = NULL;      /* integer parameters for pastix */
    float          *dparm  = NULL;      /* floating parameters for pastix */
    pastix_int_t   *perm   = NULL;      /* Permutation tabular */
    pastix_int_t   *invp   = NULL;      /* Reverse permutation tabular */
    pastix_int_t    mat_type;

    magma_c_sparse_matrix A_h1, B;
    magma_c_vector b_h;

    /* The source location of the copy is b's own location, not A's. */
    magma_c_vtransfer( b, &b_h, b.memory_location, Magma_CPU );

    /* B becomes a host CSR copy of A; A_h1 is only needed as an
       intermediate when a format conversion is required. */
    if ( A.storage_type != Magma_CSR ) {
        magma_c_mtransfer( A, &A_h1, A.memory_location, Magma_CPU );
        magma_c_mconvert( A_h1, &B, A_h1.storage_type, Magma_CSR );
    }
    else {
        magma_c_mtransfer( A, &B, A.memory_location, Magma_CPU );
    }

    rhs      = (pastix_float_t*) b_h.val;
    ncol     = B.num_rows;
    colptr   = B.row;    /* CSR row pointers are handed over as "colptr" */
    rows     = B.col;    /* CSR column indices are handed over as "rows" */
    values   = (pastix_float_t*) B.val;
    mat_type = API_SYM_NO;

    /* NOTE(review): the four malloc results below are used unchecked. */
    iparm = (pastix_int_t*)   malloc( IPARM_SIZE*sizeof(pastix_int_t) );
    dparm = (pastix_float_t*) malloc( DPARM_SIZE*sizeof(pastix_float_t) );

    /*******************************************/
    /* Initialize parameters to default values */
    /*******************************************/
    iparm[IPARM_MODIFY_PARAMETER] = API_NO;
    pastix( &pastix_data, MPI_COMM_WORLD,
            ncol, colptr, rows, values,
            perm, invp, rhs, 1, iparm, dparm );

    iparm[IPARM_THREAD_NBR]    = 16;
    iparm[IPARM_SYM]           = mat_type;
    iparm[IPARM_FACTORIZATION] = API_FACT_LU;
    iparm[IPARM_VERBOSE]       = API_VERBOSE_YES;
    iparm[IPARM_ORDERING]      = API_ORDER_SCOTCH;
    iparm[IPARM_INCOMPLETE]    = API_NO;
    iparm[IPARM_RHS_MAKING]    = API_RHS_B;
    //iparm[IPARM_AMALGAMATION] = 5;
    iparm[IPARM_LEVEL_OF_FILL] = 0;
    /* if (incomplete == 1)
         {
           dparm[DPARM_EPSILON_REFINEMENT] = 1e-7;
         }
    */

    /*
     * Matrix needs :
     *    - to be in fortran numbering
     *    - to have only the lower triangular part in symmetric case
     *    - to have a graph with a symmetric structure in unsymmetric case
     * If those criteria are not matched, the csc will be reallocated and changed.
     */
    iparm[IPARM_MATRIX_VERIFICATION] = API_YES;

    perm = (pastix_int_t*) malloc( ncol*sizeof(pastix_int_t) );
    invp = (pastix_int_t*) malloc( ncol*sizeof(pastix_int_t) );

    /*******************************************/
    /* Step 1 - Ordering / Scotch              */
    /* Perform it only when the pattern of     */
    /* matrix change.                          */
    /* eg: mesh refinement                     */
    /* In many cases users can simply go from  */
    /* API_TASK_ORDERING to API_TASK_ANALYSE   */
    /* in one call.                            */
    /*******************************************/
    /*******************************************/
    /* Step 2 - Symbolic factorization         */
    /* Perform it only when the pattern of     */
    /* matrix change.                          */
    /*******************************************/
    /*******************************************/
    /* Step 3 - Mapping and Compute scheduling */
    /* Perform it only when the pattern of     */
    /* matrix change.                          */
    /*******************************************/
    /*******************************************/
    /* Step 4 - Numerical Factorisation        */
    /* Perform it each time the values of the  */
    /* matrix changed.                         */
    /*******************************************/
    iparm[IPARM_START_TASK] = API_TASK_ORDERING;
    iparm[IPARM_END_TASK]   = API_TASK_NUMFACT;

    pastix( &pastix_data, MPI_COMM_WORLD,
            ncol, colptr, rows, values,
            perm, invp, NULL, 1, iparm, dparm );

    precond->int_array_1 = (magma_int_t*) perm;
    precond->int_array_2 = (magma_int_t*) invp;

    /* M.col receives B.row and M.row receives B.col; magma_capplypastix()
       reads them back under the same (colptr, rows) naming. */
    precond->M.val             = (magmaFloatComplex*) values;
    precond->M.col             = (magma_int_t*) colptr;
    precond->M.row             = (magma_int_t*) rows;
    precond->M.num_rows        = A.num_rows;
    precond->M.num_cols        = A.num_cols;
    precond->M.memory_location = Magma_CPU;
    precond->pastix_data       = pastix_data;
    precond->iparm             = iparm;
    precond->dparm             = dparm;

    if ( A.storage_type != Magma_CSR ) {
        magma_c_mfree( &A_h1 );
    }
    magma_c_vfree( &b_h );

    /* B is intentionally NOT freed: precond->M now points at B's val/row/col
       arrays and magma_capplypastix() passes them to pastix() on every
       solve. Releasing B here would leave those pointers dangling. */

    #else
    printf( "error: only real supported yet.\n");
    #endif

#else
    printf( "error: pastix not available.\n");
#endif

    return MAGMA_SUCCESS;
}
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { TESTING_INIT(); magma_queue_t queue; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magma_c_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; magma_int_t *pntre; magmaFloatComplex c_one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex c_zero = MAGMA_C_MAKE(0.0, 0.0); magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_cspmv" " [ --blocksize %d --alignment %d (for SELLP) ]" " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); magma_cm_5stencil( laplace_size, &hA, queue ); } else { // file-matrix test magma_c_csr_mtx( &hA, argv[i], queue ); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) hA.num_rows,(int) hA.num_cols,(int) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; magma_c_vector hx, hy, dx, dy, hrefvec, hcheck; // init CPU vectors magma_c_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue ); magma_c_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue ); // init DEV vectors magma_c_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); #ifdef MAGMA_WITH_MKL // calling MKL with CSR pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) ); pntre[0] = 0; for (j=0; j<hA.num_rows; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT *col; TESTING_MALLOC_CPU( col, MKL_INT, nnz ); for( 
magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_MALLOC_CPU( row, MKL_INT, num_rows ); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } start = magma_wtime(); for (j=0; j<10; j++ ) { mkl_ccsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); TESTING_FREE_CPU( row ); TESTING_FREE_CPU( col ); free(pntre); #endif // MAGMA_WITH_MKL // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); // SpMV on GPU (CSR) -- this is the reference! start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA, queue ); magma_c_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ); // convert to ELL and copy to GPU magma_c_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue ); magma_c_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_ELL, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (ELL) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_ELL, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA_ELL, queue ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); if ( res < .000001 ) printf("# tester spmv ELL: ok\n"); else printf("# tester spmv ELL: failed\n"); magma_c_vfree( &hcheck, queue ); // convert to SELLP 
and copy to GPU magma_c_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ); magma_c_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_SELLP, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv SELL-P: ok\n"); else printf("# tester spmv SELL-P: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_mfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); magmaFloatComplex alpha = c_one; magmaFloatComplex beta = c_zero; magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseCcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, 
FLOPS*10/(end-start) ); cusparseMatDescr_t descrA; cusparseStatus = cusparseCreateMatDescr(&descrA); if (cusparseStatus != 0) printf("error\n"); cusparseHybMat_t hybA; cusparseStatus = cusparseCreateHybMat( &hybA ); if (cusparseStatus != 0) printf("error\n"); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE CSR: ok\n"); else printf("# tester spmv cuSPARSE CSR: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); cusparseCcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA, dA.dval, dA.drow, dA.dcol, hybA, 0, CUSPARSE_HYB_PARTITION_AUTO); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseChybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA, hybA, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE HYB: ok\n"); else printf("# tester spmv cuSPARSE HYB: failed\n"); magma_c_vfree( &hcheck, queue ); cusparseDestroyMatDescr( descrA ); cusparseDestroyHybMat( hybA ); cusparseDestroy( cusparseHandle ); magma_c_mfree(&dA, queue ); printf("\n\n"); // free CPU memory magma_c_mfree(&hA, queue ); magma_c_vfree(&hx, queue ); magma_c_vfree(&hy, queue ); magma_c_vfree(&hrefvec, queue ); // free GPU memory magma_c_vfree(&dx, queue ); magma_c_vfree(&dy, queue ); i++; } magma_queue_destroy( queue 
); TESTING_FINALIZE(); return 0; }