/**
 * Hands out the dimensions and the device CSR arrays of a sparse matrix.
 *
 * If A already lives on the device in CSR format, the outputs simply alias
 * A's own arrays. Otherwise A is converted to CSR, transferred to the device,
 * and the function recurses on that temporary copy.
 *
 * @param[in]  A      input matrix
 * @param[out] m      number of rows
 * @param[out] n      number of columns
 * @param[out] row    device row-pointer array
 * @param[out] col    device column-index array
 * @param[out] val    device value array
 * @param[in]  queue  queue to execute in
 *
 * @return 0 on success, otherwise the error code of the failing step.
 *
 * NOTE(review): in the conversion path the returned pointers refer to the
 * temporary A_DEV, which is released at `cleanup` before returning. Callers
 * passing a non-device or non-CSR matrix therefore receive pointers to freed
 * memory. Fixing this requires an ownership decision (who frees the copy?),
 * so it is flagged here rather than silently changed.
 */
magma_int_t
magma_scsrget_gpu(
    magma_s_matrix A,
    magma_int_t *m,
    magma_int_t *n,
    magmaIndex_ptr *row,
    magmaIndex_ptr *col,
    magmaFloat_ptr *val,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_s_matrix A_DEV={Magma_CSR}, A_CSR={Magma_CSR};

    if ( A.memory_location == Magma_DEV && A.storage_type == Magma_CSR ) {
        *m = A.num_rows;
        *n = A.num_cols;
        *val = A.dval;
        *col = A.dcol;
        *row = A.drow;
    } else {
        CHECK( magma_smconvert( A, &A_CSR, A.storage_type, Magma_CSR, queue ));
        CHECK( magma_smtransfer( A_CSR, &A_DEV, A.memory_location, Magma_DEV, queue ));
        // propagate a failure of the recursive call instead of dropping it
        CHECK( magma_scsrget_gpu( A_DEV, m, n, row, col, val, queue ));
    }

cleanup:
    magma_smfree( &A_CSR, queue );
    magma_smfree( &A_DEV, queue );
    return info;
}
/**
 * Shrinks a matrix with at most as many rows as columns to a square one.
 *
 * For a host CSR matrix, A is copied into B, every entry whose column index
 * is >= A.num_rows is set to zero, the copy is passed through
 * magma_smcsrcompressor, and B->num_cols is set to B->num_rows.
 * Any other location/format is moved to the host, converted to CSR, shrunk
 * by a recursive call, and then converted/transferred back.
 *
 * @param[in]  A      input matrix (must satisfy num_rows <= num_cols)
 * @param[out] B      shrunk matrix
 * @param[in]  queue  queue to execute in
 *
 * @return 0 on success, MAGMA_ERR_NOT_SUPPORTED if A has more rows than
 *         columns, otherwise the error code of the failing step.
 */
extern "C" magma_int_t
magma_smshrink(
    magma_s_matrix A,
    magma_s_matrix *B,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    // host-side temporaries for the generic (non host-CSR) path
    magma_s_matrix hA={Magma_CSR}, hACSR={Magma_CSR},
                   hB={Magma_CSR}, hBCSR={Magma_CSR};

    // reject matrices with more rows than columns up front
    if ( A.num_rows > A.num_cols ) {
        printf("%% error: A has too many rows: m > n.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    if ( A.memory_location == Magma_CPU && A.storage_type == Magma_CSR ) {
        CHECK( magma_smconvert( A, B, Magma_CSR, Magma_CSR, queue ));

        // zero every entry that lies right of the square part
        for ( magma_int_t el = 0; el < A.nnz; el++ ) {
            if ( B->col[el] >= A.num_rows ) {
                B->val[el] = MAGMA_S_ZERO;
            }
        }

        CHECK( magma_smcsrcompressor( B, queue ) );
        B->num_cols = B->num_rows;
    }
    else {
        CHECK( magma_smtransfer( A, &hA, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_smconvert( hA, &hACSR, A.storage_type, Magma_CSR, queue ));
        CHECK( magma_smshrink( hACSR, &hBCSR, queue ));
        CHECK( magma_smconvert( hBCSR, &hB, Magma_CSR, A.storage_type, queue ));
        CHECK( magma_smtransfer( hB, B, Magma_CPU, A.memory_location, queue ));
    }

cleanup:
    magma_smfree( &hA, queue );
    magma_smfree( &hB, queue );
    magma_smfree( &hACSR, queue );
    magma_smfree( &hBCSR, queue );
    return info;
}
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR}; magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR}; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; #ifdef MAGMA_WITH_MKL magma_int_t *pntre=NULL; #endif cusparseHandle_t cusparseHandle = NULL; cusparseMatDescr_t descr = NULL; float c_one = MAGMA_S_MAKE(1.0, 0.0); float c_zero = MAGMA_S_MAKE(0.0, 0.0); float accuracy = 1e-10; #define PRECISION_s #if defined(PRECISION_c) accuracy = 1e-4; #endif #if defined(PRECISION_s) accuracy = 1e-4; #endif magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf("\n# usage: ./run_sspmm" " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n", (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; // m - number of rows for the sparse matrix // n - number of vectors to be multiplied in the SpMM product magma_int_t m, n; m = hA.num_rows; n = 48; // init CPU vectors TESTING_CHECK( magma_svinit( 
&hx, Magma_CPU, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue )); // init DEV vectors TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue )); // calling MKL with CSR #ifdef MAGMA_WITH_MKL TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) ); pntre[0] = 0; for (j=0; j < m; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT num_vecs = n; MKL_INT *col; TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } // === Call MKL with consecutive SpMVs, using mkl_scsrmv === // warmp up mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); // === Call MKL with blocked SpMVs, using mkl_scsrmm === char transa = 'n'; MKL_INT ldb = n, ldc=n; char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'}; // warm up mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), 
&ldc ); } end = magma_wtime(); printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); magma_free_cpu( row ); magma_free_cpu( col ); row = NULL; col = NULL; #endif // MAGMA_WITH_MKL // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue )); // SpMV on GPU (CSR) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue )); magma_smfree(&dA, queue ); // convert to SELLP and copy to GPU TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue )); TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue )); magma_smfree(&hA_SELLP, queue ); magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res=res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm SELL-P: ok\n"); else printf("%% tester spmm SELL-P: failed\n"); magma_smfree( &hcheck, queue ); magma_smfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); //#ifdef PRECISION_d start = 
magma_sync_wtime( queue ); TESTING_CHECK( cusparseCreate( &cusparseHandle )); TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) )); TESTING_CHECK( cusparseCreateMatDescr( &descr )); TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); float alpha = c_one; float beta = c_zero; // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) ); for (j=0; j < 10; j++) { cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols); } end = magma_sync_wtime( queue ); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/**
 * Biconjugate gradient (BiCG) solver for A * x = b.
 *
 * Builds the conjugate transpose of A (on the host, in CSR, then converted
 * back to A's storage type and moved to the device) and iterates with both
 * A and that transpose until the residual satisfies the relative or absolute
 * tolerance, a NaN/Inf is detected, or solver_par->maxiter is reached.
 *
 * @param[in]     A           system matrix
 * @param[in]     b           right-hand side
 * @param[in,out] x           initial guess on entry, approximation on exit
 * @param[in,out] solver_par  solver parameters (rtol, atol, maxiter, verbose)
 *                            on entry; iteration count, residuals, runtime
 *                            and info on exit
 * @param[in]     queue       queue to execute in
 *
 * @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE, or the
 *         error code of a failing helper. The same value is stored in
 *         solver_par->info.
 */
extern "C" magma_int_t
magma_sbicg(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BICG;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // some useful variables
    float c_zero = MAGMA_S_ZERO;
    float c_one = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t dofs = A.num_rows * b.num_cols;

    // workspace
    magma_s_matrix r={Magma_CSR}, rt={Magma_CSR}, p={Magma_CSR}, pt={Magma_CSR},
                   z={Magma_CSR}, zt={Magma_CSR}, q={Magma_CSR}, y={Magma_CSR},
                   yt={Magma_CSR}, qt={Magma_CSR};

    // need to transpose the matrix
    magma_s_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &rt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &p, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &pt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &q, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &qt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &y, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &yt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // solver variables
    float alpha, rho, beta, rho_new, ptq;
    float res, nomb, nom0, r0;

    // transpose the matrix: device -> host -> CSR -> conjugate transpose
    // -> original storage type -> device. Every step is checked so that a
    // failure is reported here instead of surfacing later in the SpMV.
    CHECK( magma_smtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_smconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransposeconjugate( Ah2, &Ah1, queue ));
    magma_smfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_smconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_smfree(&Ah2, queue );

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    res = nom0;
    solver_par->init_res = nom0;
    magma_scopy( dofs, r.dval, 1, rt.dval, 1, queue );                  // rt = r
    rho_new = magma_sdot( dofs, rt.dval, 1, r.dval, 1, queue );         // rho = <rt,r>
    rho = alpha = MAGMA_S_MAKE( 1.0, 0. );

    nomb = magma_snrm2( dofs, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = nom0;
        solver_par->timing[0] = 0.0;
    }
    // initial guess is already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    //Chronometry
    real_Double_t tempo1, tempo2;
    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        magma_scopy( dofs, r.dval, 1 , y.dval, 1, queue );             // y=r
        magma_scopy( dofs, y.dval, 1 , z.dval, 1, queue );             // z=y
        magma_scopy( dofs, rt.dval, 1 , yt.dval, 1, queue );           // yt=rt
        magma_scopy( dofs, yt.dval, 1 , zt.dval, 1, queue );           // zt=yt

        rho= rho_new;
        rho_new = magma_sdot( dofs, rt.dval, 1, z.dval, 1, queue );    // rho=<rt,z>
        if( magma_s_isnan_inf( rho_new ) ){
            info = MAGMA_DIVERGENCE;
            break;
        }

        if( solver_par->numiter==1 ){
            magma_scopy( dofs, z.dval, 1 , p.dval, 1, queue );         // p=z
            magma_scopy( dofs, zt.dval, 1 , pt.dval, 1, queue );       // pt=zt
        } else {
            beta = rho_new/rho;
            magma_sscal( dofs, beta, p.dval, 1, queue );                       // p = beta*p
            magma_saxpy( dofs, c_one , z.dval, 1 , p.dval, 1, queue );         // p = z+beta*p
            magma_sscal( dofs, MAGMA_S_CONJ(beta), pt.dval, 1, queue );        // pt = beta*pt
            magma_saxpy( dofs, c_one , zt.dval, 1 , pt.dval, 1, queue );       // pt = zt+beta*pt
        }
        CHECK( magma_s_spmv( c_one, A, p, c_zero, q, queue ));         // q = A p
        CHECK( magma_s_spmv( c_one, AT, pt, c_zero, qt, queue ));      // qt = AT pt
        solver_par->spmv_count++;
        solver_par->spmv_count++;
        ptq = magma_sdot( dofs, pt.dval, 1, q.dval, 1, queue );
        alpha = rho_new /ptq;

        magma_saxpy( dofs, alpha, p.dval, 1 , x->dval, 1, queue );                             // x=x+alpha*p
        magma_saxpy( dofs, c_neg_one * alpha, q.dval, 1 , r.dval, 1, queue );                  // r=r-alpha*q
        magma_saxpy( dofs, c_neg_one * MAGMA_S_CONJ(alpha), qt.dval, 1 , rt.dval, 1, queue );  // rt=rt-conj(alpha)*qt

        res = magma_snrm2( dofs, r.dval, 1, queue );

        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            break;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    float residual;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // A NaN/Inf break leaves info == MAGMA_DIVERGENCE; that status must not
    // be overwritten by the "stopped before maxiter" success rule below.
    if ( info != MAGMA_DIVERGENCE ) {
        if ( solver_par->numiter < solver_par->maxiter ) {
            info = MAGMA_SUCCESS;
        } else if ( solver_par->init_res > solver_par->final_res ) {
            if ( solver_par->verbose > 0 ) {
                if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                    solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) res;
                    solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) tempo2-tempo1;
                }
            }
            info = MAGMA_SLOW_CONVERGENCE;
            if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            if ( solver_par->verbose > 0 ) {
                if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                    solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) res;
                    solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                            = (real_Double_t) tempo2-tempo1;
                }
            }
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&rt, queue );
    magma_smfree(&p, queue );
    magma_smfree(&pt, queue );
    magma_smfree(&q, queue );
    magma_smfree(&qt, queue );
    magma_smfree(&y, queue );
    magma_smfree(&yt, queue );
    magma_smfree(&z, queue );
    magma_smfree(&zt, queue );
    magma_smfree(&AT, queue );
    magma_smfree(&Ah1, queue );
    magma_smfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_sbicg */
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( /*devices[ opts->device ],*/ &queue ); float one = MAGMA_S_MAKE(1.0, 0.0); float zero = MAGMA_S_MAKE(0.0, 0.0); magma_s_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_s_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; if ( zopts.solver_par.solver != Magma_PCG && zopts.solver_par.solver != Magma_PGMRES && zopts.solver_par.solver != Magma_PBICGSTAB && zopts.solver_par.solver != Magma_ITERREF && zopts.solver_par.solver != Magma_LOBPCG ) zopts.precond_par.solver = Magma_NONE; CHECK( magma_ssolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) A.num_rows,(int) A.num_cols,(int) A.nnz ); // for the eigensolver case zopts.solver_par.ev_length = A.num_rows; CHECK( magma_seigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix CHECK( magma_smscale( &A, zopts.scaling, queue )); CHECK( magma_smconvert( A, &B, Magma_CSR, zopts.output_format, queue )); CHECK( magma_smtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( magma_svinit( &b, Magma_DEV, A.num_cols, 1, one, queue )); //magma_svinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_s_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_smfree(&x, queue ); CHECK( magma_svinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_s_solver( B_d, b, &x, 
&zopts, queue ); if( info != 0 ){ printf("error: solver returned: %s (%d).\n", magma_strerror( info ), info ); } magma_ssolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); i++; } cleanup: magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); magma_ssolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( /*devices[ opts->device ],*/ &queue ); real_Double_t res; magma_s_matrix A={Magma_CSR}, AT={Magma_CSR}, A2={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; int i=1; real_Double_t start, end; CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) A.num_rows,(int) A.num_cols,(int) A.nnz ); // scale matrix CHECK( magma_smscale( &A, zopts.scaling, queue )); // remove nonzeros in matrix start = magma_sync_wtime( queue ); for (int j=0; j<10; j++) CHECK( magma_smcsrcompressor( &A, queue )); end = magma_sync_wtime( queue ); printf( " > MAGMA CPU: %.2e seconds.\n", (end-start)/10 ); // transpose CHECK( magma_smtranspose( A, &AT, queue )); // convert, copy back and forth to check everything works CHECK( magma_smconvert( AT, &B, Magma_CSR, Magma_CSR, queue )); magma_smfree(&AT, queue ); CHECK( magma_smtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); magma_smfree(&B, queue ); start = magma_sync_wtime( queue ); for (int j=0; j<10; j++) CHECK( magma_smcsrcompressor_gpu( &B_d, queue )); end = magma_sync_wtime( queue ); printf( " > MAGMA GPU: %.2e seconds.\n", (end-start)/10 ); CHECK( magma_smtransfer( B_d, &B, Magma_DEV, Magma_CPU, queue )); magma_smfree(&B_d, queue ); CHECK( magma_smconvert( B, &AT, Magma_CSR, Magma_CSR, queue )); magma_smfree(&B, queue ); // transpose back CHECK( magma_smtranspose( AT, &A2, queue )); magma_smfree(&AT, queue ); 
CHECK( magma_smdiff( A, A2, &res, queue )); printf("# ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester matrix compressor: ok\n"); else printf("# tester matrix compressor: failed\n"); magma_smfree(&A, queue ); magma_smfree(&A2, queue ); i++; } cleanup: magma_smfree(&AT, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&A2, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/**
 * Computes B as the conjugate transpose of the sparse matrix A.
 *
 * A device-resident CSR matrix is transposed with cusparseScsr2csc and then
 * passed to magma_smconjugate. A host matrix is copied to the device,
 * handled by a recursive call, and the result copied back. Any other case is
 * converted to CSR first, handled recursively, and converted back to A's
 * storage type.
 *
 * @param[in]  A      input matrix
 * @param[out] B      conjugate transpose of A; released again if an error
 *                    occurs
 * @param[in]  queue  queue to execute in
 *
 * @return 0 on success, otherwise the error code of the failing step.
 */
extern "C" magma_int_t
magma_smtransposeconjugate(
    magma_s_matrix A,
    magma_s_matrix *B,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    cusparseHandle_t   cusparse_handle = NULL;
    cusparseMatDescr_t descr_in  = NULL;
    cusparseMatDescr_t descr_out = NULL;

    // temporaries for the format-conversion and the host paths
    magma_s_matrix A_csr={Magma_CSR}, B_csr={Magma_CSR};
    magma_s_matrix A_dev={Magma_CSR}, B_dev={Magma_CSR};

    if ( A.storage_type == Magma_CSR && A.memory_location == Magma_DEV ) {
        // describe B: same layout as A with the dimensions swapped
        B->storage_type    = A.storage_type;
        B->diagorder_type  = A.diagorder_type;
        B->memory_location = Magma_DEV;
        B->num_rows        = A.num_cols;
        B->num_cols        = A.num_rows;
        B->nnz             = A.nnz;
        B->true_nnz        = A.true_nnz;

        // transposing swaps the lower and upper fill modes
        switch ( A.fill_mode ) {
            case MagmaFull:
                B->fill_mode = MagmaFull;
                break;
            case MagmaLower:
                B->fill_mode = MagmaUpper;
                break;
            case MagmaUpper:
                B->fill_mode = MagmaLower;
                break;
            default:
                break;
        }

        B->dval = NULL;
        B->drow = NULL;
        B->dcol = NULL;

        // device storage for B
        CHECK( magma_smalloc( &B->dval, B->nnz ));
        CHECK( magma_index_malloc( &B->drow, B->num_rows + 1 ));
        CHECK( magma_index_malloc( &B->dcol, B->nnz ));

        // cuSPARSE setup on the queue's stream
        CHECK_CUSPARSE( cusparseCreate( &cusparse_handle ));
        CHECK_CUSPARSE( cusparseSetStream( cusparse_handle, queue->cuda_stream() ));
        CHECK_CUSPARSE( cusparseCreateMatDescr( &descr_in ));
        CHECK_CUSPARSE( cusparseCreateMatDescr( &descr_out ));
        CHECK_CUSPARSE( cusparseSetMatType( descr_in, CUSPARSE_MATRIX_TYPE_GENERAL ));
        CHECK_CUSPARSE( cusparseSetMatType( descr_out, CUSPARSE_MATRIX_TYPE_GENERAL ));
        CHECK_CUSPARSE( cusparseSetMatIndexBase( descr_in, CUSPARSE_INDEX_BASE_ZERO ));
        CHECK_CUSPARSE( cusparseSetMatIndexBase( descr_out, CUSPARSE_INDEX_BASE_ZERO ));

        // the CSC arrays of A are used as the CSR arrays of B
        CHECK_CUSPARSE(
            cusparseScsr2csc( cusparse_handle, A.num_rows, A.num_cols, A.nnz,
                              A.dval, A.drow, A.dcol,
                              B->dval, B->dcol, B->drow,
                              CUSPARSE_ACTION_NUMERIC,
                              CUSPARSE_INDEX_BASE_ZERO) );
        CHECK( magma_smconjugate( B, queue ));
    }
    else if ( A.memory_location == Magma_CPU ) {
        // host matrix: do the work on the device and copy the result back
        CHECK( magma_smtransfer( A, &A_dev, A.memory_location, Magma_DEV, queue ));
        CHECK( magma_smtransposeconjugate( A_dev, &B_dev, queue ));
        CHECK( magma_smtransfer( B_dev, B, Magma_DEV, A.memory_location, queue ));
    }
    else {
        // remaining case: go through CSR and convert back
        CHECK( magma_smconvert( A, &A_csr, A.storage_type, Magma_CSR, queue ));
        CHECK( magma_smtransposeconjugate( A_csr, &B_csr, queue ));
        CHECK( magma_smconvert( B_csr, B, Magma_CSR, A.storage_type, queue ));
    }

cleanup:
    cusparseDestroyMatDescr( descr_in );
    cusparseDestroyMatDescr( descr_out );
    cusparseDestroy( cusparse_handle );
    magma_smfree( &A_dev, queue );
    magma_smfree( &B_dev, queue );
    magma_smfree( &A_csr, queue );
    magma_smfree( &B_csr, queue );
    if ( info != 0 ) {
        magma_smfree( B, queue );
    }
    return info;
}
/**
 * Jacobi iteration for A * x = b.
 *
 * A is converted to CSR, the diagonal scaling vector is built with
 * magma_sjacobisetup_diagscal, and magma_sjacobispmvupdate is applied in
 * chunks until solver_par->maxiter iterations have been performed. With
 * solver_par->verbose > 0 the chunk length is `verbose` and the residual and
 * accumulated runtime are recorded after every chunk; otherwise a single
 * chunk of maxiter iterations is run.
 *
 * @param[in]     A           system matrix
 * @param[in]     b           right-hand side
 * @param[in,out] x           initial guess on entry, approximation on exit
 * @param[in,out] solver_par  solver parameters and feedback
 * @param[in]     queue       queue to execute in
 *
 * @return MAGMA_SUCCESS if the final residual is smaller than the initial
 *         one, MAGMA_DIVERGENCE otherwise, or the error code of a failing
 *         helper. The same value is stored in solver_par->info.
 */
extern "C" magma_int_t
magma_sjacobi(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    float c_zero = MAGMA_S_ZERO;
    real_Double_t t_begin, t_end, elapsed=0;
    float residual;
    // number of Jacobi sweeps performed per pass of the loop below
    magma_int_t sweeps_per_pass;

    magma_s_matrix r={Magma_CSR}, d={Magma_CSR}, ACSR={Magma_CSR};

    CHECK( magma_smconvert(A, &ACSR, A.storage_type, Magma_CSR, queue ) );

    // prepare solver feedback
    solver_par->solver = Magma_JACOBI;
    solver_par->info = MAGMA_SUCCESS;

    // initial residual
    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_sresidualvec( ACSR, b, *x, &r, &residual, queue));
    solver_par->init_res = residual;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t) residual;
    }

    // diagonal scaling used by the update kernel
    CHECK( magma_sjacobisetup_diagscal( ACSR, &d, queue ));

    sweeps_per_pass = ( solver_par->verbose > 0 ) ? solver_par->verbose
                                                  : solver_par->maxiter;

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    do {
        // only the update itself is timed, not the residual bookkeeping
        t_begin = magma_sync_wtime( queue );
        solver_par->numiter = solver_par->numiter + sweeps_per_pass;
        CHECK( magma_sjacobispmvupdate( sweeps_per_pass, ACSR, r, b, d, x, queue ));
        solver_par->spmv_count = solver_par->spmv_count + sweeps_per_pass;
        t_end = magma_sync_wtime( queue );
        elapsed += t_end - t_begin;

        if ( solver_par->verbose > 0 ) {
            CHECK( magma_sresidualvec( ACSR, b, *x, &r, &residual, queue));
            solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) residual;
            solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) elapsed;
        }
    } while ( solver_par->numiter < solver_par->maxiter );

    solver_par->runtime = (real_Double_t) elapsed;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->final_res = residual;

    if ( solver_par->init_res > solver_par->final_res ) {
        info = MAGMA_SUCCESS;
    }
    else {
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_smfree( &r, queue );
    magma_smfree( &d, queue );
    magma_smfree( &ACSR, queue );

    solver_par->info = info;
    return info;
}   /* magma_sjacobi */
extern "C" magma_int_t magma_scsrsplit( magma_int_t offset, magma_int_t bsize, magma_s_matrix A, magma_s_matrix *D, magma_s_matrix *R, magma_queue_t queue ) { magma_int_t info = 0; magma_int_t i, k, j, nnz_diag, nnz_offd; D->val = NULL; D->col = NULL; D->row = NULL; D->rowidx = NULL; D->blockinfo = NULL; D->diag = NULL; D->dval = NULL; D->dcol = NULL; D->drow = NULL; D->drowidx = NULL; D->ddiag = NULL; R->val = NULL; R->col = NULL; R->row = NULL; R->rowidx = NULL; R->blockinfo = NULL; R->diag = NULL; R->dval = NULL; R->dcol = NULL; R->drow = NULL; R->drowidx = NULL; R->ddiag = NULL; if ( A.memory_location == Magma_CPU && ( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRCOO ) ) { nnz_diag = nnz_offd = 0; // Count the new number of nonzeroes in the two matrices for( i=0; i<offset; i+=offset ){ for( k=i; k<min(A.num_rows,i+offset); k++ ){ int check = 0; for( j=A.row[k]; j<A.row[k+1]; j++ ){ if ( A.col[j] < i ) nnz_offd++; else if ( A.col[j] < i+offset ){ if( A.col[j] == k ){ check = 1; } nnz_diag++; } else nnz_offd++; } if( check == 0 ){ printf("error: matrix contains zero on diagonal at (%d,%d).\n", int(i), int(i)); info = -1; goto cleanup; } } } magma_int_t ii = i; for( i=ii; i<A.num_rows; i+=bsize ){ for( k=i; k<min(A.num_rows,i+bsize); k++ ){ int check = 0; for( j=A.row[k]; j<A.row[k+1]; j++ ){ if ( A.col[j] < i ) nnz_offd++; else if ( A.col[j] < i+bsize ){ if( A.col[j] == k ){ check = 1; } nnz_diag++; } else nnz_offd++; } if( check == 0 ){ printf("error: matrix contains zero on diagonal at (%d,%d).\n", int(i), int(i)); info = -1; goto cleanup; } } } // Allocate memory for the new matrices D->storage_type = Magma_CSRD; D->memory_location = A.memory_location; D->num_rows = A.num_rows; D->num_cols = A.num_cols; D->nnz = nnz_diag; R->storage_type = Magma_CSR; R->memory_location = A.memory_location; R->num_rows = A.num_rows; R->num_cols = A.num_cols; R->nnz = nnz_offd; CHECK( magma_smalloc_cpu( &D->val, nnz_diag )); CHECK( magma_index_malloc_cpu( &D->row, 
A.num_rows+1 )); CHECK( magma_index_malloc_cpu( &D->col, nnz_diag )); CHECK( magma_smalloc_cpu( &R->val, nnz_offd )); CHECK( magma_index_malloc_cpu( &R->row, A.num_rows+1 )); CHECK( magma_index_malloc_cpu( &R->col, nnz_offd )); // Fill up the new sparse matrices D->row[0] = 0; R->row[0] = 0; nnz_offd = nnz_diag = 0; for( i=0; i<offset; i+=offset) { for( k=i; k<min(A.num_rows,i+offset); k++ ) { D->row[k+1] = D->row[k]; R->row[k+1] = R->row[k]; for( j=A.row[k]; j<A.row[k+1]; j++ ) { if ( A.col[j] < i ) { R->val[nnz_offd] = A.val[j]; R->col[nnz_offd] = A.col[j]; R->row[k+1]++; nnz_offd++; } else if ( A.col[j] < i+offset ) { // larger than diagonal remain as before if ( A.col[j]>k ) { D->val[nnz_diag] = A.val[ j ]; D->col[nnz_diag] = A.col[ j ]; D->row[k+1]++; } // diagonal is written first else if ( A.col[j]==k ) { D->val[D->row[k]] = A.val[ j ]; D->col[D->row[k]] = A.col[ j ]; D->row[k+1]++; } // smaller than diagonal are shifted one to the right // to have room for the diagonal else { D->val[nnz_diag+1] = A.val[ j ]; D->col[nnz_diag+1] = A.col[ j ]; D->row[k+1]++; } nnz_diag++; } else { R->val[nnz_offd] = A.val[j]; R->col[nnz_offd] = A.col[j]; R->row[k+1]++; nnz_offd++; } } } } ii = i; for( i=ii; i<A.num_rows; i+=bsize) { for( k=i; k<min(A.num_rows,i+bsize); k++ ) { D->row[k+1] = D->row[k]; R->row[k+1] = R->row[k]; for( j=A.row[k]; j<A.row[k+1]; j++ ) { if ( A.col[j] < i ) { R->val[nnz_offd] = A.val[j]; R->col[nnz_offd] = A.col[j]; R->row[k+1]++; nnz_offd++; } else if ( A.col[j] < i+bsize ) { // larger than diagonal remain as before if ( A.col[j]>k ) { D->val[nnz_diag] = A.val[ j ]; D->col[nnz_diag] = A.col[ j ]; D->row[k+1]++; } // diagonal is written first else if ( A.col[j]==k ) { D->val[D->row[k]] = A.val[ j ]; D->col[D->row[k]] = A.col[ j ]; D->row[k+1]++; } // smaller than diagonal are shifted one to the right // to have room for the diagonal else { D->val[nnz_diag+1] = A.val[ j ]; D->col[nnz_diag+1] = A.col[ j ]; D->row[k+1]++; } nnz_diag++; } else { 
R->val[nnz_offd] = A.val[j]; R->col[nnz_offd] = A.col[j]; R->row[k+1]++; nnz_offd++; } } } } } else { magma_s_matrix Ah={Magma_CSR}, ACSR={Magma_CSR}, DCSR={Magma_CSR}, RCSR={Magma_CSR}, Dh={Magma_CSR}, Rh={Magma_CSR}; CHECK( magma_smtransfer( A, &Ah, A.memory_location, Magma_CPU, queue )); CHECK( magma_smconvert( Ah, &ACSR, A.storage_type, Magma_CSR, queue )); CHECK( magma_scsrsplit( offset, bsize, ACSR, &DCSR, &RCSR, queue )); CHECK( magma_smconvert( DCSR, &Dh, Magma_CSR, A.storage_type, queue )); CHECK( magma_smconvert( RCSR, &Rh, Magma_CSR, A.storage_type, queue )); CHECK( magma_smtransfer( Dh, D, Magma_CPU, A.memory_location, queue )); CHECK( magma_smtransfer( Rh, R, Magma_CPU, A.memory_location, queue )); magma_smfree( &Ah, queue ); magma_smfree( &ACSR, queue ); magma_smfree( &Dh, queue ); magma_smfree( &DCSR, queue ); magma_smfree( &Rh, queue ); magma_smfree( &RCSR, queue ); } cleanup: if( info != 0 ){ magma_smfree( D, queue ); magma_smfree( R, queue ); } return info; }
/**
 * Builds the Jacobi iteration matrix M and vector c for A * x = b.
 *
 * On a host CSR copy of A every row is divided by its diagonal entry and the
 * diagonal entry itself is set to zero; the result is passed through
 * magma_s_csr_compressor, converted back to A's storage type if necessary,
 * and transferred to A's memory location as M. c holds b divided row-wise by
 * the diagonal, transferred to A's memory location.
 *
 * @param[in]  A      system matrix
 * @param[in]  b      right-hand side
 * @param[out] M      scaled matrix with zeroed diagonal
 * @param[out] c      scaled right-hand side
 * @param[in]  queue  queue to execute in
 *
 * @return 0 on success, otherwise the error code of the failing step.
 *
 * NOTE(review): a zero diagonal entry is only reported via printf; the
 * division below is still carried out. Only the first A.num_rows entries of
 * b are scaled, and b is transferred using A.memory_location — confirm both
 * against the callers.
 */
extern "C" magma_int_t
magma_sjacobisetup(
    magma_s_matrix A,
    magma_s_matrix b,
    magma_s_matrix *M,
    magma_s_matrix *c,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i;

    magma_s_matrix A_h1={Magma_CSR}, A_h2={Magma_CSR}, B={Magma_CSR}, C={Magma_CSR};
    magma_s_matrix diag={Magma_CSR}, c_t={Magma_CSR}, b_h={Magma_CSR};

    CHECK( magma_svinit( &c_t, Magma_CPU, A.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
    CHECK( magma_svinit( &diag, Magma_CPU, A.num_rows, b.num_cols, MAGMA_S_ZERO, queue ));
    CHECK( magma_smtransfer( b, &b_h, A.memory_location, Magma_CPU, queue ));

    // B becomes a host CSR copy of A
    if ( A.storage_type != Magma_CSR ) {
        CHECK( magma_smtransfer( A, &A_h1, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_smconvert( A_h1, &B, A_h1.storage_type, Magma_CSR, queue ));
    }
    else {
        CHECK( magma_smtransfer( A, &B, A.memory_location, Magma_CPU, queue ));
    }

    for( magma_int_t rowindex=0; rowindex<B.num_rows; rowindex++ ) {
        magma_int_t start = (B.drow[rowindex]);
        magma_int_t end = (B.drow[rowindex+1]);

        // locate the diagonal entry of this row
        for( i=start; i<end; i++ ) {
            if ( B.dcol[i]==rowindex ) {
                diag.val[rowindex] = B.val[i];
                if ( MAGMA_S_REAL( diag.val[rowindex]) == 0 )
                    printf(" error: zero diagonal element in row %d!\n",
                           int(rowindex));
            }
        }
        // scale the row by the diagonal and zero the diagonal entry
        for( i=start; i<end; i++ ) {
            B.val[i] = B.val[i] / diag.val[rowindex];
            if ( B.dcol[i]==rowindex ) {
                B.val[i] = MAGMA_S_MAKE( 0., 0. );
            }
        }
        c_t.val[rowindex] = b_h.val[rowindex] / diag.val[rowindex];
    }

    CHECK( magma_s_csr_compressor(&B.val, &B.drow, &B.dcol,
                                  &C.val, &C.drow, &C.dcol, &B.num_rows, queue ));

    C.num_rows = B.num_rows;
    C.num_cols = B.num_cols;
    C.memory_location = B.memory_location;
    C.nnz = C.drow[B.num_rows];
    C.storage_type = B.storage_type;
    C.memory_location = B.memory_location;

    if ( A.storage_type != Magma_CSR) {
        A_h2.alignment = A.alignment;
        A_h2.blocksize = A.blocksize;
        CHECK( magma_smconvert( C, &A_h2, Magma_CSR, A_h1.storage_type, queue ));
        CHECK( magma_smtransfer( A_h2, M, Magma_CPU, A.memory_location, queue ));
    }
    else {
        CHECK( magma_smtransfer( C, M, Magma_CPU, A.memory_location, queue ));
    }
    CHECK( magma_smtransfer( c_t, c, Magma_CPU, A.memory_location, queue ));

cleanup:
    // A_h1 / A_h2 are released here so that they do not leak when one of
    // the CHECKs above fails after they have been allocated
    magma_smfree( &A_h1, queue );
    magma_smfree( &A_h2, queue );
    magma_smfree( &B, queue );
    magma_smfree( &C, queue );
    magma_smfree( &diag, queue );
    magma_smfree( &c_t, queue );
    magma_smfree( &b_h, queue );
    return info;
}
/**
 * Extracts the inverse diagonal of A for Jacobi-type diagonal scaling.
 *
 * The diagonal is read from a host CSR view of A: A itself when it already
 * is CSR on the CPU, otherwise a transferred and converted copy. For every
 * row the first entry whose column equals the row index is inverted and
 * stored. The result is transferred to d with destination Magma_DEV,
 * regardless of where A lives.
 *
 * @param[in]  A      input matrix
 * @param[out] d      vector of inverse diagonal entries
 * @param[in]  queue  queue handed to every MAGMA call
 *
 * @return 0 on success; MAGMA_ERR_BADPRECOND when a row has no diagonal
 *         entry or its diagonal entry is zero; otherwise the error code of
 *         the first failing MAGMA call.
 */
extern "C" magma_int_t
magma_sjacobisetup_diagscal(
    magma_s_matrix A,
    magma_s_matrix *d,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    magma_int_t i;

    magma_s_matrix A_h1={Magma_CSR}, B={Magma_CSR};
    magma_s_matrix diag={Magma_CSR};
    // matrix the diagonal is read from: A itself or its host CSR copy B
    const magma_s_matrix *src = &A;

    CHECK( magma_svinit( &diag, Magma_CPU, A.num_rows, 1, MAGMA_S_ZERO, queue ));

    if ( A.storage_type != Magma_CSR || A.memory_location != Magma_CPU ) {
        CHECK( magma_smtransfer( A, &A_h1, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_smconvert( A_h1, &B, A_h1.storage_type, Magma_CSR, queue ));
        src = &B;
    }

    for( magma_int_t rowindex=0; rowindex<src->num_rows; rowindex++ ) {
        magma_int_t start = (src->drow[rowindex]);
        magma_int_t end = (src->drow[rowindex+1]);

        for( i=start; i<end; i++ ) {
            if ( src->dcol[i]==rowindex ) {
                // Invert only a nonzero diagonal. Inverting a stored zero
                // would yield inf, which the zero test below cannot detect.
                if ( src->val[i] != MAGMA_S_ZERO ) {
                    diag.val[rowindex] = 1.0/src->val[i];
                }
                break;
            }
        }

        // diag was initialised to zero, so it is still zero here when the
        // diagonal entry is missing or is an explicit zero
        if ( diag.val[rowindex] == MAGMA_S_ZERO ){
            printf(" error: zero diagonal element in row %d!\n", int(rowindex));
            info = MAGMA_ERR_BADPRECOND;
            goto cleanup;
        }
    }

    CHECK( magma_smtransfer( diag, d, Magma_CPU, Magma_DEV, queue ));

cleanup:
    magma_smfree( &A_h1, queue );
    magma_smfree( &B, queue );
    magma_smfree( &diag, queue );
    return info;
}
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_INIT(); magma_sopts zopts; magma_queue_t queue=NULL; magma_queue_create( &queue ); float one = MAGMA_S_MAKE(1.0, 0.0); float zero = MAGMA_S_MAKE(0.0, 0.0); magma_s_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR}; magma_s_matrix x={Magma_CSR}, b={Magma_CSR}; int i=1; CHECK( magma_sparse_opts( argc, argv, &zopts, &i, queue )); B.blocksize = zopts.blocksize; B.alignment = zopts.alignment; CHECK( magma_ssolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); CHECK( magma_sm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test CHECK( magma_s_csr_mtx( &A, argv[i], queue )); } // for the eigensolver case zopts.solver_par.ev_length = A.num_cols; CHECK( magma_seigensolverinfo_init( &zopts.solver_par, queue )); // scale matrix CHECK( magma_smscale( &A, zopts.scaling, queue )); // preconditioner if ( zopts.solver_par.solver != Magma_ITERREF ) { CHECK( magma_s_precondsetup( A, b, &zopts.solver_par, &zopts.precond_par, queue ) ); } CHECK( magma_smconvert( A, &B, Magma_CSR, zopts.output_format, queue )); printf( "\n%% matrix info: %d-by-%d with %d nonzeros\n\n", int(A.num_rows), int(A.num_cols), int(A.nnz) ); printf("matrixinfo = [ \n"); printf("%% size (m x n) || nonzeros (nnz) || nnz/m || stored nnz\n"); printf("%%======================================================================" "======%%\n"); printf(" %8d %8d %10d %4d %10d\n", int(B.num_rows), int(B.num_cols), int(B.true_nnz), int(B.true_nnz/B.num_rows), int(B.nnz) ); printf("%%======================================================================" "======%%\n"); printf("];\n"); CHECK( magma_smtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue )); // vectors and initial guess CHECK( 
magma_svinit( &b, Magma_DEV, A.num_rows, 1, one, queue )); //magma_svinit( &x, Magma_DEV, A.num_cols, 1, one, queue ); //magma_s_spmv( one, B_d, x, zero, b, queue ); // b = A x //magma_smfree(&x, queue ); CHECK( magma_svinit( &x, Magma_DEV, A.num_cols, 1, zero, queue )); info = magma_s_solver( B_d, b, &x, &zopts, queue ); if( info != 0 ) { printf("%%error: solver returned: %s (%d).\n", magma_strerror( info ), int(info) ); } printf("data = [\n"); magma_ssolverinfo( &zopts.solver_par, &zopts.precond_par, queue ); printf("];\n\n"); printf("precond_info = [\n"); printf("%% setup runtime\n"); printf(" %.6f %.6f\n", zopts.precond_par.setuptime, zopts.precond_par.runtime ); printf("];\n\n"); magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); i++; } cleanup: magma_smfree(&B_d, queue ); magma_smfree(&B, queue ); magma_smfree(&A, queue ); magma_smfree(&x, queue ); magma_smfree(&b, queue ); magma_ssolverinfo_free( &zopts.solver_par, &zopts.precond_par, queue ); magma_queue_destroy( queue ); TESTING_FINALIZE(); return info; }
/**
 * LSQR iteration for A x = b with optional preconditioning.
 *
 * The routine builds the conjugate transpose of A on the host (transfer,
 * convert to CSR, transpose, convert back, transfer to the device) and then
 * alternates products with A and with that transpose. Inside the loop the
 * preconditioner is applied only when one is configured and A is square.
 * The transposed preconditioner is applied whenever one is configured.
 *
 * @param[in]     A            system matrix (read from the device)
 * @param[in]     b            right-hand side on the device
 * @param[in,out] x            initial guess on entry, approximation on exit
 * @param[in,out] solver_par   tolerances and iteration limit on entry;
 *                             iteration count, residuals, timing and the
 *                             final info code on exit
 * @param[in]     precond_par  preconditioner description
 * @param[in]     queue        queue handed to every MAGMA call
 *
 * @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE or MAGMA_DIVERGENCE as
 *         decided after the iteration, or the error code of the first
 *         failing MAGMA call. The same value is stored in solver_par->info.
 */
extern "C" magma_int_t
magma_slsqr(
    magma_s_matrix A, magma_s_matrix b, magma_s_matrix *x,
    magma_s_solver_par *solver_par,
    magma_s_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_LSQR;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // vector lengths on the row side (m) and on the column side (n) of A
    magma_int_t m = A.num_rows * b.num_cols;
    magma_int_t n = A.num_cols * b.num_cols;

    // local variables
    float c_zero = MAGMA_S_ZERO, c_one = MAGMA_S_ONE;

    // solver variables
    float s, nom0, r0, res=0, nomb, phibar, beta, alpha, c, rho, rhot, phi,
          thet, normr, normar, norma, sumnormd2, normd;
    float residual;

    // chronometry; declared up front so no CHECK jumps across a declaration
    real_Double_t tempo1 = 0.0, tempo2 = 0.0;

    // need to transpose the matrix
    magma_s_matrix AT={Magma_CSR}, Ah1={Magma_CSR}, Ah2={Magma_CSR};

    // GPU workspace
    magma_s_matrix r={Magma_CSR}, v={Magma_CSR}, z={Magma_CSR}, zt={Magma_CSR},
                   d={Magma_CSR}, vt={Magma_CSR}, u={Magma_CSR};

    // r holds the residual b - A*x, which has one entry per row of A (the
    // same length as u), so it is sized by A.num_rows.
    CHECK( magma_svinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &v, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &z, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &d, Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &vt,Magma_DEV, A.num_cols, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &u, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_svinit( &zt,Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));

    // transpose the matrix; every step is checked so that a failed
    // conversion is reported instead of iterating with an empty AT
    CHECK( magma_smtransfer( A, &Ah1, Magma_DEV, Magma_CPU, queue ));
    CHECK( magma_smconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransposeconjugate( Ah2, &Ah1, queue ));
    magma_smfree(&Ah2, queue );
    Ah2.blocksize = A.blocksize;
    Ah2.alignment = A.alignment;
    CHECK( magma_smconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ));
    magma_smfree(&Ah1, queue );
    CHECK( magma_smtransfer( Ah2, &AT, Magma_CPU, Magma_DEV, queue ));
    magma_smfree(&Ah2, queue );

    // solver setup
    CHECK( magma_sresidualvec( A, b, *x, &r, &nom0, queue));
    solver_par->init_res = nom0;
    nomb = magma_snrm2( m, b.dval, 1, queue );
    if ( nomb == 0.0 ){
        nomb=1.0;
    }
    if ( (r0 = nomb * solver_par->rtol) < ATOLERANCE ){
        r0 = ATOLERANCE;
    }
    solver_par->final_res = solver_par->init_res;
    solver_par->iter_res = solver_par->init_res;
    if ( solver_par->verbose > 0 ) {
        solver_par->res_vec[0] = (real_Double_t)nom0;
        solver_par->timing[0] = 0.0;
    }
    // the initial guess is already good enough
    if ( nom0 < r0 ) {
        info = MAGMA_SUCCESS;
        goto cleanup;
    }

    // u = b / ||b||
    magma_scopy( m, b.dval, 1, u.dval, 1, queue );
    beta = magma_snrm2( m, u.dval, 1, queue );
    magma_sscal( m, MAGMA_S_MAKE(1./beta, 0.0 ), u.dval, 1, queue );
    normr = beta;
    c = 1.0;
    s = 0.0;
    phibar = beta;

    // v = AT * u (preconditioned when requested), then normalised
    CHECK( magma_s_spmv( c_one, AT, u, c_zero, v, queue ));
    if( precond_par->solver != Magma_NONE ){
        CHECK( magma_s_applyprecond_right( MagmaTrans, A, v, &zt, precond_par, queue ));
        CHECK( magma_s_applyprecond_left( MagmaTrans, A, zt, &v, precond_par, queue ));
    }
    alpha = magma_snrm2( n, v.dval, 1, queue );
    magma_sscal( n, MAGMA_S_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );
    normar = alpha * beta;
    norma = 0;
    sumnormd2 = 0;

    tempo1 = magma_sync_wtime( queue );

    solver_par->numiter = 0;
    // start iteration
    do
    {
        solver_par->numiter++;

        // z = (preconditioned) v; the preconditioner is skipped for
        // non-square A
        if( precond_par->solver == Magma_NONE || A.num_rows != A.num_cols ) {
            magma_scopy( n, v.dval, 1 , z.dval, 1, queue );
        } else {
            CHECK( magma_s_applyprecond_left( MagmaNoTrans, A, v, &zt, precond_par, queue ));
            CHECK( magma_s_applyprecond_right( MagmaNoTrans, A, zt, &z, precond_par, queue ));
        }

        // u = A*z - alpha*u, formed through the temporary zt, then normalised
        CHECK( magma_s_spmv( c_one, A, z, c_zero, zt, queue ));
        magma_sscal( m, MAGMA_S_MAKE(-alpha, 0.0 ), u.dval, 1, queue );
        magma_saxpy( m, c_one, zt.dval, 1, u.dval, 1, queue );
        solver_par->spmv_count++;
        beta = magma_snrm2( m, u.dval, 1, queue );
        magma_sscal( m, MAGMA_S_MAKE(1./beta, 0.0 ), u.dval, 1, queue );

        // norma = norm([norma alpha beta]);
        norma = sqrt(norma*norma + alpha*alpha + beta*beta );

        // update of the rotation coefficients c and s
        thet = -s * alpha;
        rhot = c * alpha;
        rho = sqrt( rhot * rhot + beta * beta );
        c = rhot / rho;
        s = - beta / rho;
        phi = c * phibar;
        phibar = s * phibar;

        // d = (z - thet * d) / rho;
        magma_sscal( n, MAGMA_S_MAKE(-thet, 0.0 ), d.dval, 1, queue );
        magma_saxpy( n, c_one, z.dval, 1, d.dval, 1, queue );
        magma_sscal( n, MAGMA_S_MAKE(1./rho, 0.0 ), d.dval, 1, queue );
        // accumulated, but not read anywhere else in this routine
        normd = magma_snrm2( n, d.dval, 1, queue );
        sumnormd2 = sumnormd2 + normd*normd;

        // convergence check
        res = normr;
        if ( solver_par->verbose > 0 ) {
            tempo2 = magma_sync_wtime( queue );
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        // check for convergence in A*x=b
        if ( res/nomb <= solver_par->rtol || res <= solver_par->atol ){
            info = MAGMA_SUCCESS;
            break;
        }
        // check for convergence in min{|b-A*x|}
        if ( A.num_rows != A.num_cols &&
               ( normar/(norma*normr) <= solver_par->rtol || normar <= solver_par->atol ) ){
            printf("%% warning: quit from minimization convergence check.\n");
            info = MAGMA_SUCCESS;
            break;
        }

        // x = x + phi * d
        magma_saxpy( n, MAGMA_S_MAKE( phi, 0.0 ), d.dval, 1, x->dval, 1, queue );
        normr = fabs(s) * normr;

        // v = AT*u - beta*v (preconditioned when requested), then normalised
        CHECK( magma_s_spmv( c_one, AT, u, c_zero, vt, queue ));
        solver_par->spmv_count++;
        if( precond_par->solver != Magma_NONE ){
            CHECK( magma_s_applyprecond_right( MagmaTrans, A, vt, &zt, precond_par, queue ));
            CHECK( magma_s_applyprecond_left( MagmaTrans, A, zt, &vt, precond_par, queue ));
        }
        magma_sscal( n, MAGMA_S_MAKE(-beta, 0.0 ), v.dval, 1, queue );
        magma_saxpy( n, c_one, vt.dval, 1, v.dval, 1, queue );
        alpha = magma_snrm2( n, v.dval, 1, queue );
        magma_sscal( n, MAGMA_S_MAKE(1./alpha, 0.0 ), v.dval, 1, queue );

        normar = alpha * fabs(s*phi);
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    CHECK( magma_sresidualvec( A, b, *x, &r, &residual, queue));
    solver_par->iter_res = res;
    solver_par->final_res = residual;

    // Final classification. A run that converged before the iteration limit
    // keeps MAGMA_SUCCESS; every other run records its last residual and is
    // judged by comparing the initial and the final residual.
    if ( !( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose == 0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) res;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        if ( solver_par->init_res > solver_par->final_res ) {
            info = MAGMA_SLOW_CONVERGENCE;
            if( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
                solver_par->iter_res < solver_par->atol ) {
                info = MAGMA_SUCCESS;
            }
        }
        else {
            info = MAGMA_DIVERGENCE;
        }
    }

cleanup:
    magma_smfree(&r, queue );
    magma_smfree(&v, queue );
    magma_smfree(&z, queue );
    magma_smfree(&zt, queue );
    magma_smfree(&d, queue );
    magma_smfree(&vt, queue );
    magma_smfree(&u, queue );
    magma_smfree(&AT, queue );
    magma_smfree(&Ah1, queue );
    magma_smfree(&Ah2, queue );

    solver_par->info = info;
    return info;
}   /* magma_slsqr */
/**
 * Cuts one block of consecutive rows ("slice") out of a square CSR matrix
 * that lives on the CPU.
 *
 * The rows are divided into num_slices blocks of ceil(num_rows/num_slices)
 * rows; the block with index slice covers rows [*start, *end). Three
 * matrices are produced:
 *   - B:     same dimensions as A; rows inside the slice are copied from A,
 *            every other row holds a single 1 on the diagonal.
 *   - ALOC:  size-by-size matrix of the slice entries whose column lies
 *            inside [*start, *end), with columns shifted by -*start.
 *   - ANLOC: size-by-num_cols matrix of the remaining slice entries, with
 *            their original column indices.
 *
 * @param[in]  num_slices  number of row blocks, must be >= 1
 * @param[in]  slice       block to extract, 0 <= slice < num_slices
 * @param[in]  A           square CSR matrix on the CPU
 * @param[out] B           slice embedded into the identity
 * @param[out] ALOC        local part of the slice
 * @param[out] ANLOC       non-local part of the slice
 * @param[out] comm_i      A.num_rows entries; set to 1 for every column that
 *                         occurs in the non-local part, 0 otherwise
 * @param[out] comm_v      A.num_rows entries; per column the sum of the
 *                         absolute values of the non-local entries
 * @param[out] start       first row of the slice
 * @param[out] end         one past the last row of the slice
 * @param[in]  queue       queue handed to every MAGMA call
 *
 * @return 0 on success, MAGMA_ERR_NOT_SUPPORTED for unsupported input,
 *         otherwise the error code of the first failing MAGMA call.
 */
extern "C" magma_int_t
magma_smslice(
    magma_int_t num_slices,
    magma_int_t slice,
    magma_s_matrix A,
    magma_s_matrix *B,
    magma_s_matrix *ALOC,
    magma_s_matrix *ANLOC,
    magma_index_t *comm_i,
    float *comm_v,
    magma_int_t *start,
    magma_int_t *end,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    if( A.num_rows != A.num_cols ){
        printf("%% error: only supported for square matrices.\n");
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }
    // a non-positive slice count would divide by zero below, and a slice
    // index outside the range would address rows that do not exist
    if( num_slices < 1 || slice < 0 || slice >= num_slices ){
        printf("%% error: invalid slice %d of %d.\n", int(slice), int(num_slices) );
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    if ( A.memory_location == Magma_CPU && A.storage_type == Magma_CSR ){
        // The three outputs start as copies of A so that they carry its
        // header data. The arrays that are rebuilt below are released
        // right away and the pointers cleared, so that a failure in
        // between never leaves a dangling pointer in an output.
        CHECK( magma_smconvert( A, B, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( B->col );
        B->col = NULL;
        magma_free_cpu( B->val );
        B->val = NULL;
        CHECK( magma_smconvert( A, ALOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ALOC->col );
        ALOC->col = NULL;
        magma_free_cpu( ALOC->row );
        ALOC->row = NULL;
        magma_free_cpu( ALOC->val );
        ALOC->val = NULL;
        CHECK( magma_smconvert( A, ANLOC, Magma_CSR, Magma_CSR, queue ) );
        magma_free_cpu( ANLOC->col );
        ANLOC->col = NULL;
        magma_free_cpu( ANLOC->row );
        ANLOC->row = NULL;
        magma_free_cpu( ANLOC->val );
        ANLOC->val = NULL;

        magma_int_t i,j,k, nnz, nnz_loc=0, loc_row = 0, nnz_nloc = 0;
        magma_index_t col;
        magma_int_t size = magma_ceildiv( A.num_rows, num_slices );
        // both bounds are clamped: trailing slices may be short or empty
        magma_int_t lstart = min( slice*size, A.num_rows );
        magma_int_t lend = min( (slice+1)*size, A.num_rows );
        // correct size for last slice
        size = lend-lstart;
        CHECK( magma_index_malloc_cpu( &ALOC->row, size+1 ) );
        CHECK( magma_index_malloc_cpu( &ANLOC->row, size+1 ) );

        // count elements for slice - identity for rest
        nnz = A.row[ lend ] - A.row[ lstart ] + ( A.num_rows - size );
        CHECK( magma_index_malloc_cpu( &B->col, nnz ) );
        CHECK( magma_smalloc_cpu( &B->val, nnz ) );

        // for the communication plan
        for( i=0; i<A.num_rows; i++ ) {
            comm_i[i] = 0;
            comm_v[i] = MAGMA_S_ZERO;
        }

        k=0;
        // first row pointer (i equals A.num_rows at this point, so it must
        // not be used as the index here)
        B->row[0] = 0;
        ALOC->row[0] = 0;
        ANLOC->row[0] = 0;

        // identity above slice
        for( i=0; i<lstart; i++ ) {
            B->row[i+1] = B->row[i]+1;
            B->val[k] = MAGMA_S_ONE;
            B->col[k] = i;
            k++;
        }

        // slice: copy the rows into B and count local / non-local entries
        for( i=lstart; i<lend; i++ ) {
            B->row[i+1] = B->row[i] + (A.row[i+1]-A.row[i]);
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                B->val[k] = A.val[j];
                col = A.col[j];
                B->col[k] = col;
                // communication plan
                if( col<lstart || col>=lend ){
                    comm_i[ col ] = 1;
                    comm_v[ col ] = comm_v[ col ]
                            + MAGMA_S_MAKE( MAGMA_S_ABS( A.val[j] ), 0.0 );
                    nnz_nloc++;
                } else {
                    nnz_loc++;
                }
                k++;
            }
            loc_row++;
            ALOC->row[ loc_row ] = nnz_loc;
            ANLOC->row[ loc_row ] = nnz_nloc;
        }

        CHECK( magma_index_malloc_cpu( &ALOC->col, nnz_loc ) );
        CHECK( magma_smalloc_cpu( &ALOC->val, nnz_loc ) );
        ALOC->num_rows = size;
        ALOC->num_cols = size;
        ALOC->nnz = nnz_loc;

        CHECK( magma_index_malloc_cpu( &ANLOC->col, nnz_nloc ) );
        CHECK( magma_smalloc_cpu( &ANLOC->val, nnz_nloc ) );
        ANLOC->num_rows = size;
        ANLOC->num_cols = A.num_cols;
        ANLOC->nnz = nnz_nloc;

        // second pass over the slice: fill the local / non-local matrix
        nnz_loc = 0;
        nnz_nloc = 0;
        for( i=lstart; i<lend; i++ ) {
            for( j=A.row[i]; j<A.row[i+1]; j++ ){
                col = A.col[j];
                // insert only in local part in ALOC, nonlocal in ANLOC
                if( col<lstart || col>=lend ){
                    ANLOC->val[ nnz_nloc ] = A.val[j];
                    ANLOC->col[ nnz_nloc ] = col;
                    nnz_nloc++;
                } else {
                    ALOC->val[ nnz_loc ] = A.val[j];
                    ALOC->col[ nnz_loc ] = col-lstart;
                    nnz_loc++;
                }
            }
        }

        // identity below slice
        for( i=lend; i<A.num_rows; i++ ) {
            B->row[i+1] = B->row[i]+1;
            B->val[k] = MAGMA_S_ONE;
            B->col[k] = i;
            k++;
        }
        B->nnz = k;
        *start = lstart;
        *end = lend;
    }
    else {
        printf("error: mslice only supported for CSR matrices on the CPU: %d %d.\n",
               int(A.memory_location), int(A.storage_type) );
        info = MAGMA_ERR_NOT_SUPPORTED;
    }

cleanup:
    return info;
}